
/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapred;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.catalog.MetaReader;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.MutationSerialization;
import org.apache.hadoop.hbase.mapreduce.ResultSerialization;
import org.apache.hadoop.hbase.security.token.AuthenticationTokenIdentifier;
import org.apache.hadoop.hbase.security.token.AuthenticationTokenSelector;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.security.UserProvider;
import org.apache.hadoop.hbase.zookeeper.ZKClusterId;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.security.token.Token;
import org.apache.zookeeper.KeeperException;
import org.cliffc.high_scale_lib.Counter;

/**
 * Utility for {@link TableMap} and {@link TableReduce}
 */
@Deprecated
@InterfaceAudience.Public
@InterfaceStability.Stable
@SuppressWarnings({ "rawtypes", "unchecked" })
public class TableMapReduceUtil {

  /**
   * Use this before submitting a TableMap job. It will
   * appropriately set up the JobConf.
   *
   * @param table  The table name to read from.
   * @param columns  The columns to scan.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job configuration to adjust.
   */
  public static void initTableMapJob(String table, String columns,
    Class<? extends TableMap> mapper,
    Class<?> outputKeyClass,
    Class<?> outputValueClass, JobConf job) {
    initTableMapJob(table, columns, mapper, outputKeyClass, outputValueClass, job,
      true, TableInputFormat.class);
  }

  /**
   * Use this before submitting a TableMap job. It will
   * appropriately set up the JobConf; see
   * {@link #initTableMapJob(String, String, Class, Class, Class, JobConf, boolean, Class)}
   * for the parameter descriptions.
   */
  public static void initTableMapJob(String table, String columns,
    Class<? extends TableMap> mapper,
    Class<?> outputKeyClass,
    Class<?> outputValueClass, JobConf job, boolean addDependencyJars) {
    initTableMapJob(table, columns, mapper, outputKeyClass, outputValueClass, job,
      addDependencyJars, TableInputFormat.class);
  }

  /**
   * Use this before submitting a TableMap job. It will
   * appropriately set up the JobConf.
   *
   * @param table  The table name to read from.
   * @param columns  The columns to scan.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job configuration to adjust.
   * @param addDependencyJars upload HBase jars and jars for any of the configured
   *           job classes via the distributed cache (tmpjars).
   * @param inputFormat  The input format class to use.
   */
  public static void initTableMapJob(String table, String columns,
    Class<? extends TableMap> mapper,
    Class<?> outputKeyClass,
    Class<?> outputValueClass, JobConf job, boolean addDependencyJars,
    Class<? extends InputFormat> inputFormat) {

    job.setInputFormat(inputFormat);
    job.setMapOutputValueClass(outputValueClass);
    job.setMapOutputKeyClass(outputKeyClass);
    job.setMapperClass(mapper);
    job.setStrings("io.serializations", job.get("io.serializations"),
        MutationSerialization.class.getName(), ResultSerialization.class.getName());
    FileInputFormat.addInputPaths(job, table);
    job.set(TableInputFormat.COLUMN_LIST, columns);
    if (addDependencyJars) {
      try {
        addDependencyJars(job);
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
    try {
      initCredentials(job);
    } catch (IOException ioe) {
      // just spit out the stack trace?  really?
      ioe.printStackTrace();
    }
  }

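  /*
   * A minimal driver sketch (not part of the original class) showing how the
   * simplest initTableMapJob overload might be used before submitting a
   * mapred job. The table name "myTable", the column "cf:qual", and the
   * classes MyDriver and MyTableMap are hypothetical placeholders; MyTableMap
   * would implement TableMap<ImmutableBytesWritable, Result>, and the driver
   * would also import org.apache.hadoop.hbase.client.Result and
   * org.apache.hadoop.mapred.JobClient.
   *
   *   JobConf job = new JobConf(HBaseConfiguration.create(), MyDriver.class);
   *   job.setJobName("example-table-map");
   *   TableMapReduceUtil.initTableMapJob("myTable", "cf:qual", MyTableMap.class,
   *     ImmutableBytesWritable.class, Result.class, job);
   *   JobClient.runJob(job);
   */
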
  /**
   * Sets up the job for reading from a table snapshot. It bypasses HBase servers
   * and reads directly from snapshot files.
   *
   * @param snapshotName The name of the snapshot (of a table) to read from.
   * @param columns  The columns to scan.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the configured
   *           job classes via the distributed cache (tmpjars).
   * @param tmpRestoreDir a temporary directory to copy the snapshot files into. The current user
   * should have write permissions to this directory, and it should not be a subdirectory of
   * rootdir. After the job is finished, the restore directory can be deleted.
   * @throws IOException When setting up the details fails.
   * @see TableSnapshotInputFormat
   */
  public static void initTableSnapshotMapJob(String snapshotName, String columns,
      Class<? extends TableMap> mapper,
      Class<?> outputKeyClass,
      Class<?> outputValueClass, JobConf job,
      boolean addDependencyJars, Path tmpRestoreDir)
  throws IOException {
    TableSnapshotInputFormat.setInput(job, snapshotName, tmpRestoreDir);
    initTableMapJob(snapshotName, columns, mapper, outputKeyClass, outputValueClass, job,
      addDependencyJars, TableSnapshotInputFormat.class);
    org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.resetCacheConfig(job);
  }

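  /*
   * A hedged sketch of how initTableSnapshotMapJob might be invoked. The
   * snapshot name "mySnapshot", the restore directory, and the classes
   * MyDriver and MyTableMap are hypothetical placeholders; the restore
   * directory must be writable by the current user and must not live under
   * the HBase root directory.
   *
   *   JobConf job = new JobConf(HBaseConfiguration.create(), MyDriver.class);
   *   Path restoreDir = new Path("/tmp/snapshot-restore");
   *   TableMapReduceUtil.initTableSnapshotMapJob("mySnapshot", "cf:qual",
   *     MyTableMap.class, ImmutableBytesWritable.class, Result.class, job,
   *     true, restoreDir);
   *   JobClient.runJob(job);
   */
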
  /**
   * Use this before submitting a TableReduce job. It will
   * appropriately set up the JobConf.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job configuration to adjust.
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReduceJob(String table,
    Class<? extends TableReduce> reducer, JobConf job)
  throws IOException {
    initTableReduceJob(table, reducer, job, null);
  }

  /**
   * Use this before submitting a TableReduce job. It will
   * appropriately set up the JobConf.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job configuration to adjust.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   * default partitioner.
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReduceJob(String table,
    Class<? extends TableReduce> reducer, JobConf job, Class partitioner)
  throws IOException {
    initTableReduceJob(table, reducer, job, partitioner, true);
  }

  /**
   * Use this before submitting a TableReduce job. It will
   * appropriately set up the JobConf.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job configuration to adjust.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   * default partitioner.
   * @param addDependencyJars upload HBase jars and jars for any of the configured
   *           job classes via the distributed cache (tmpjars).
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReduceJob(String table,
    Class<? extends TableReduce> reducer, JobConf job, Class partitioner,
    boolean addDependencyJars) throws IOException {
    job.setOutputFormat(TableOutputFormat.class);
    job.setReducerClass(reducer);
    job.set(TableOutputFormat.OUTPUT_TABLE, table);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Put.class);
    job.setStrings("io.serializations", job.get("io.serializations"),
        MutationSerialization.class.getName(), ResultSerialization.class.getName());
    if (partitioner == HRegionPartitioner.class) {
      job.setPartitionerClass(HRegionPartitioner.class);
      int regions = MetaReader.getRegionCount(HBaseConfiguration.create(job), table);
      if (job.getNumReduceTasks() > regions) {
        job.setNumReduceTasks(regions);
      }
    } else if (partitioner != null) {
      job.setPartitionerClass(partitioner);
    }
    if (addDependencyJars) {
      addDependencyJars(job);
    }
    initCredentials(job);
  }

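  /*
   * A hedged sketch of a combined driver: the map side is set up with
   * initTableMapJob and the reduce side writes Puts back to an output table
   * via initTableReduceJob. "mySourceTable", "myOutputTable", MyDriver,
   * MyTableMap and MyTableReduce are hypothetical placeholders; in this
   * example MyTableMap would implement TableMap<ImmutableBytesWritable, Put>
   * and MyTableReduce would implement TableReduce<ImmutableBytesWritable, Put>.
   *
   *   JobConf job = new JobConf(HBaseConfiguration.create(), MyDriver.class);
   *   TableMapReduceUtil.initTableMapJob("mySourceTable", "cf:qual",
   *     MyTableMap.class, ImmutableBytesWritable.class, Put.class, job);
   *   TableMapReduceUtil.initTableReduceJob("myOutputTable", MyTableReduce.class, job);
   *   JobClient.runJob(job);
   */
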
  /**
   * Obtains security credentials for the job: propagates the Hadoop delegation
   * token file from the launching process (if present) and, when HBase security
   * is enabled, adds an HBase authentication token for the current user to the
   * job's credentials.
   *
   * @param job  The current job configuration to adjust.
   * @throws IOException When obtaining the credentials fails.
   */
  public static void initCredentials(JobConf job) throws IOException {
    UserProvider userProvider = UserProvider.instantiate(job);
    if (userProvider.isHadoopSecurityEnabled()) {
      // propagate delegation related props from launcher job to MR job
      if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
      }
    }

    if (userProvider.isHBaseSecurityEnabled()) {
      try {
        // login the server principal (if using secure Hadoop)
        User user = userProvider.getCurrent();
        Token<AuthenticationTokenIdentifier> authToken = getAuthToken(job, user);
        if (authToken == null) {
          user.obtainAuthTokenForJob(job);
        } else {
          job.getCredentials().addToken(authToken.getService(), authToken);
        }
      } catch (InterruptedException ie) {
        ie.printStackTrace();
        Thread.currentThread().interrupt();
      }
    }
  }

  /**
   * Get the authentication token of the user for the cluster specified in the configuration.
   *
   * @param conf  The configuration identifying the cluster to read the cluster id from.
   * @param user  The user whose existing tokens are searched.
   * @return null if the user does not have the token, otherwise the auth token for the cluster.
   */
  private static Token<AuthenticationTokenIdentifier> getAuthToken(Configuration conf, User user)
      throws IOException, InterruptedException {
    ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "mr-init-credentials", null);
    try {
      String clusterId = ZKClusterId.readClusterIdZNode(zkw);
      return new AuthenticationTokenSelector().selectToken(
          new Text(clusterId), user.getUGI().getTokens());
    } catch (KeeperException e) {
      throw new IOException(e);
    } finally {
      zkw.close();
    }
  }

  /**
   * Ensures that the given number of reduce tasks for the given job
   * configuration does not exceed the number of regions for the given table.
   *
   * @param table  The table to get the region count for.
   * @param job  The current job configuration to adjust.
   * @throws IOException When retrieving the table details fails.
   */
  public static void limitNumReduceTasks(String table, JobConf job)
  throws IOException {
    int regions = MetaReader.getRegionCount(HBaseConfiguration.create(job), table);
    if (job.getNumReduceTasks() > regions) {
      job.setNumReduceTasks(regions);
    }
  }

  /**
   * Ensures that the given number of map tasks for the given job
   * configuration does not exceed the number of regions for the given table.
   *
   * @param table  The table to get the region count for.
   * @param job  The current job configuration to adjust.
   * @throws IOException When retrieving the table details fails.
   */
  public static void limitNumMapTasks(String table, JobConf job)
  throws IOException {
    int regions = MetaReader.getRegionCount(HBaseConfiguration.create(job), table);
    if (job.getNumMapTasks() > regions) {
      job.setNumMapTasks(regions);
    }
  }

  /**
   * Sets the number of reduce tasks for the given job configuration to the
   * number of regions the given table has.
   *
   * @param table  The table to get the region count for.
   * @param job  The current job configuration to adjust.
   * @throws IOException When retrieving the table details fails.
   */
  public static void setNumReduceTasks(String table, JobConf job)
  throws IOException {
    job.setNumReduceTasks(MetaReader.getRegionCount(HBaseConfiguration.create(job), table));
  }

  /**
   * Sets the number of map tasks for the given job configuration to the
   * number of regions the given table has.
   *
   * @param table  The table to get the region count for.
   * @param job  The current job configuration to adjust.
   * @throws IOException When retrieving the table details fails.
   */
  public static void setNumMapTasks(String table, JobConf job)
  throws IOException {
    job.setNumMapTasks(MetaReader.getRegionCount(HBaseConfiguration.create(job), table));
  }

  /**
   * Sets the number of rows to return and cache with each scanner iteration.
   * Higher caching values will enable faster mapreduce jobs at the expense of
   * requiring more heap to contain the cached rows.
   *
   * @param job The current job configuration to adjust.
   * @param batchSize The number of rows to cache and return with each scanner
   * iteration.
   */
  public static void setScannerCaching(JobConf job, int batchSize) {
    job.setInt("hbase.client.scanner.caching", batchSize);
  }

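  /*
   * A hedged sketch of how the sizing helpers above might be combined in a
   * driver ("myTable" and the caching value 500 are hypothetical):
   *
   *   // cap parallelism at the table's region count
   *   TableMapReduceUtil.limitNumMapTasks("myTable", job);
   *   TableMapReduceUtil.limitNumReduceTasks("myTable", job);
   *   // fetch 500 rows per scanner round trip
   *   TableMapReduceUtil.setScannerCaching(job, 500);
   */
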
  /**
   * @see org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil#addDependencyJars(org.apache.hadoop.mapreduce.Job)
   */
  public static void addDependencyJars(JobConf job) throws IOException {
    org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addHBaseDependencyJars(job);
    org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addDependencyJars(
      job,
      // when making changes here, consider also mapreduce.TableMapReduceUtil
      // pull job classes
      job.getMapOutputKeyClass(),
      job.getMapOutputValueClass(),
      job.getOutputKeyClass(),
      job.getOutputValueClass(),
      job.getPartitionerClass(),
      job.getClass("mapred.input.format.class", TextInputFormat.class, InputFormat.class),
      job.getClass("mapred.output.format.class", TextOutputFormat.class, OutputFormat.class),
      job.getCombinerClass());
  }
}