/**
 * Copyright 2008 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.zookeeper.ZooKeeper;

import com.google.common.base.Function;
/**
 * Utility for {@link TableMapper} and {@link TableReducer}
 */
@SuppressWarnings("unchecked")
public class TableMapReduceUtil {
  static Log LOG = LogFactory.getLog(TableMapReduceUtil.class);

  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  The table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(String table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job) throws IOException {
    job.setInputFormatClass(TableInputFormat.class);
    if (outputValueClass != null) job.setMapOutputValueClass(outputValueClass);
    if (outputKeyClass != null) job.setMapOutputKeyClass(outputKeyClass);
    job.setMapperClass(mapper);
    job.getConfiguration().set(TableInputFormat.INPUT_TABLE, table);
    job.getConfiguration().set(TableInputFormat.SCAN,
      convertScanToString(scan));
  }
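
  /*
   * Illustrative usage sketch (not part of the original class): setting up a
   * map-only scan job from a driver. "MyMapper" is a hypothetical TableMapper
   * subclass and "mytable"/"myfamily" are placeholder names; imports of
   * Bytes and Result, plus checked-exception handling, are assumed.
   *
   *   Configuration conf = HBaseConfiguration.create();
   *   Job job = new Job(conf, "scan mytable");
   *   Scan scan = new Scan();
   *   scan.addFamily(Bytes.toBytes("myfamily"));
   *   TableMapReduceUtil.initTableMapperJob("mytable", scan, MyMapper.class,
   *       ImmutableBytesWritable.class, Result.class, job);
   *   job.setNumReduceTasks(0);   // map-only job
   *   job.waitForCompletion(true);
   */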

  /**
   * Writes the given scan into a Base64 encoded string.
   *
   * @param scan  The scan to write out.
   * @return The scan saved in a Base64 encoded string.
   * @throws IOException When writing the scan fails.
   */
  static String convertScanToString(Scan scan) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    DataOutputStream dos = new DataOutputStream(out);
    scan.write(dos);
    return Base64.encodeBytes(out.toByteArray());
  }

  /**
   * Converts the given Base64 string back into a Scan instance.
   *
   * @param base64  The scan details.
   * @return The newly created Scan instance.
   * @throws IOException When reading the scan instance fails.
   */
  static Scan convertStringToScan(String base64) throws IOException {
    ByteArrayInputStream bis = new ByteArrayInputStream(Base64.decode(base64));
    DataInputStream dis = new DataInputStream(bis);
    Scan scan = new Scan();
    scan.readFields(dis);
    return scan;
  }
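
  /*
   * Round-trip sketch (illustrative, not part of the original class): the Scan
   * is serialized via Writable, Base64-encoded into the job configuration, and
   * later rebuilt by the input format. Values below are placeholders.
   *
   *   Scan scan = new Scan();
   *   scan.setCaching(500);
   *   String encoded = convertScanToString(scan);
   *   // initTableMapperJob stores this under TableInputFormat.SCAN;
   *   // TableInputFormat later reverses it:
   *   Scan restored = convertStringToScan(encoded);
   */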

  /**
   * Use this before submitting a TableReduce job. It will
   * appropriately set up the job.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReducerJob(String table,
    Class<? extends TableReducer> reducer, Job job)
  throws IOException {
    initTableReducerJob(table, reducer, job, null);
  }

  /**
   * Use this before submitting a TableReduce job. It will
   * appropriately set up the job.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   * the default partitioner.
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReducerJob(String table,
    Class<? extends TableReducer> reducer, Job job,
    Class partitioner) throws IOException {
    initTableReducerJob(table, reducer, job, partitioner, null, null, null);
  }

  /**
   * Use this before submitting a TableReduce job. It will
   * appropriately set up the job.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   * the default partitioner.
   * @param quorumAddress Distant cluster to write to, specified as
   * <code>hbase.zookeeper.quorum:zookeeper.znode.parent</code>. Pass
   * <code>null</code> to write to the cluster the job is configured against.
   * @param serverClass redefined hbase.regionserver.class
   * @param serverImpl redefined hbase.regionserver.impl
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReducerJob(String table,
    Class<? extends TableReducer> reducer, Job job,
    Class partitioner, String quorumAddress, String serverClass,
    String serverImpl) throws IOException {

    Configuration conf = job.getConfiguration();
    job.setOutputFormatClass(TableOutputFormat.class);
    if (reducer != null) job.setReducerClass(reducer);
    conf.set(TableOutputFormat.OUTPUT_TABLE, table);
    if (quorumAddress != null) {
      if (quorumAddress.split(":").length == 2) {
        conf.set(TableOutputFormat.QUORUM_ADDRESS, quorumAddress);
      } else {
        throw new IOException("Please specify the peer cluster as " +
            HConstants.ZOOKEEPER_QUORUM + ":" + HConstants.ZOOKEEPER_ZNODE_PARENT);
      }
    }
    if (serverClass != null && serverImpl != null) {
      conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
      conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
    }
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    if (partitioner == HRegionPartitioner.class) {
      HBaseConfiguration.addHbaseResources(conf);
      job.setPartitionerClass(HRegionPartitioner.class);
      HTable outputTable = new HTable(conf, table);
      int regions = outputTable.getRegionsInfo().size();
      if (job.getNumReduceTasks() > regions) {
        job.setNumReduceTasks(regions);
      }
    } else if (partitioner != null) {
      job.setPartitionerClass(partitioner);
    }
  }
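
  /*
   * Illustrative usage sketch (not part of the original class): wiring a
   * reducer that writes back to HBase, partitioned by region. "MyReducer" and
   * "outtable" are placeholder names.
   *
   *   TableMapReduceUtil.initTableReducerJob("outtable", MyReducer.class, job,
   *       HRegionPartitioner.class);
   *
   * To write to a distant cluster, the peer is addressed as
   * "hbase.zookeeper.quorum:zookeeper.znode.parent" (placeholder hosts below):
   *
   *   TableMapReduceUtil.initTableReducerJob("outtable", MyReducer.class, job,
   *       null, "zk1,zk2,zk3:/hbase", null, null);
   */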

  /**
   * Ensures that the number of reduce tasks for the given job configuration
   * does not exceed the number of regions for the given table.
   *
   * @param table  The table to get the region count for.
   * @param job  The current job to adjust.
   * @throws IOException When retrieving the table details fails.
   */
  public static void limitNumReduceTasks(String table, Job job)
  throws IOException {
    HTable outputTable = new HTable(job.getConfiguration(), table);
    int regions = outputTable.getRegionsInfo().size();
    if (job.getNumReduceTasks() > regions)
      job.setNumReduceTasks(regions);
  }

  /**
   * Sets the number of reduce tasks for the given job configuration to the
   * number of regions the given table has.
   *
   * @param table  The table to get the region count for.
   * @param job  The current job to adjust.
   * @throws IOException When retrieving the table details fails.
   */
  public static void setNumReduceTasks(String table, Job job)
  throws IOException {
    HTable outputTable = new HTable(job.getConfiguration(), table);
    int regions = outputTable.getRegionsInfo().size();
    job.setNumReduceTasks(regions);
  }
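
  /*
   * Illustrative sketch (not part of the original class): sizing the reduce
   * phase by region count; "outtable" is a placeholder table name.
   *
   *   // cap reducers at the region count of the output table
   *   TableMapReduceUtil.limitNumReduceTasks("outtable", job);
   *   // or force exactly one reducer per region
   *   TableMapReduceUtil.setNumReduceTasks("outtable", job);
   */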

  /**
   * Sets the number of rows to return and cache with each scanner iteration.
   * Higher caching values will enable faster mapreduce jobs at the expense of
   * requiring more heap to contain the cached rows.
   *
   * @param job The current job to adjust.
   * @param batchSize The number of rows to return and cache with each scanner
   * iteration.
   */
  public static void setScannerCaching(Job job, int batchSize) {
    job.getConfiguration().setInt("hbase.client.scanner.caching", batchSize);
  }
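
  /*
   * Illustrative sketch (not part of the original class): raising scanner
   * caching for a full-table scan job; 500 is an arbitrary example value.
   *
   *   TableMapReduceUtil.setScannerCaching(job, 500);
   *   // equivalent to:
   *   // job.getConfiguration().setInt("hbase.client.scanner.caching", 500);
   */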

  /**
   * Add the HBase dependency jars as well as jars for any of the configured
   * job classes to the job configuration, so that JobClient will ship them
   * to the cluster and add them to the DistributedCache.
   */
  public static void addDependencyJars(Job job) throws IOException {
    try {
      addDependencyJars(job.getConfiguration(),
          ZooKeeper.class,
          Function.class, // Guava collections
          job.getMapOutputKeyClass(),
          job.getMapOutputValueClass(),
          job.getOutputKeyClass(),
          job.getOutputValueClass(),
          job.getOutputFormatClass(),
          job.getPartitionerClass(),
          job.getCombinerClass());
    } catch (ClassNotFoundException e) {
      throw new IOException(e);
    }
  }
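
  /*
   * Illustrative sketch (not part of the original class): a typical driver
   * calls this once after the job classes are set, so their containing jars
   * land in "tmpjars" and are shipped via the DistributedCache. "MyMapper" is
   * the same hypothetical mapper used above.
   *
   *   TableMapReduceUtil.initTableMapperJob("mytable", scan, MyMapper.class,
   *       ImmutableBytesWritable.class, Result.class, job);
   *   TableMapReduceUtil.addDependencyJars(job);
   *   job.waitForCompletion(true);
   */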

  /**
   * Add the jars containing the given classes to the job's configuration
   * such that JobClient will ship them to the cluster and add them to
   * the DistributedCache.
   */
  public static void addDependencyJars(Configuration conf,
      Class... classes) throws IOException {

    FileSystem localFs = FileSystem.getLocal(conf);

    Set<String> jars = new HashSet<String>();
    for (Class clazz : classes) {
      if (clazz == null) continue;

      String pathStr = findContainingJar(clazz);
      if (pathStr == null) {
        LOG.warn("Could not find jar for class " + clazz +
            " in order to ship it to the cluster.");
        continue;
      }
      Path path = new Path(pathStr);
      if (!localFs.exists(path)) {
        LOG.warn("Could not validate jar file " + path + " for class "
            + clazz);
        continue;
      }
      jars.add(path.makeQualified(localFs).toString());
    }
    if (jars.isEmpty()) return;

    String tmpJars = conf.get("tmpjars");
    if (tmpJars == null) {
      tmpJars = StringUtils.arrayToString(jars.toArray(new String[0]));
    } else {
      tmpJars += "," + StringUtils.arrayToString(jars.toArray(new String[0]));
    }
    conf.set("tmpjars", tmpJars);
  }
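
  /*
   * Illustrative sketch (not part of the original class): shipping an extra,
   * job-specific dependency by class. "MyCustomFilter" is a hypothetical class
   * that lives in a third-party jar on the client classpath.
   *
   *   TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
   *       MyCustomFilter.class);
   */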

  /**
   * Find a jar that contains a class of the same name, if any.
   * It will return a jar file, even if that is not the first thing
   * on the class path that has a class with the same name.
   *
   * This is shamelessly copied from JobConf.
   *
   * @param my_class the class to find.
   * @return a jar file that contains the class, or null.
   */
  private static String findContainingJar(Class my_class) {
    ClassLoader loader = my_class.getClassLoader();
    String class_file = my_class.getName().replaceAll("\\.", "/") + ".class";
    try {
      for (Enumeration itr = loader.getResources(class_file);
          itr.hasMoreElements();) {
        URL url = (URL) itr.nextElement();
        if ("jar".equals(url.getProtocol())) {
          String toReturn = url.getPath();
          if (toReturn.startsWith("file:")) {
            toReturn = toReturn.substring("file:".length());
          }
          // URLDecoder is a misnamed class, since it actually decodes
          // x-www-form-urlencoded MIME type rather than actual
          // URL encoding (which the file path has). Therefore it would
          // decode +s to ' 's which is incorrect (spaces are actually
          // either unencoded or encoded as "%20"). Replace +s first, so
          // that they are kept sacred during the decoding process.
          toReturn = toReturn.replaceAll("\\+", "%2B");
          toReturn = URLDecoder.decode(toReturn, "UTF-8");
          return toReturn.replaceAll("!.*$", "");
        }
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    return null;
  }

}