/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.catalog.MetaReader;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.hadoopbackport.JarFinder;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.StringUtils;

import com.google.protobuf.InvalidProtocolBufferException;
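
/**
 * Utility for {@link TableMapper} and {@link TableReducer}
 */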
@SuppressWarnings("unchecked")
@InterfaceAudience.Public
@InterfaceStability.Stable
public class TableMapReduceUtil {
  static Log LOG = LogFactory.getLog(TableMapReduceUtil.class);
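
  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * <p>A minimal usage sketch; {@code MyMapper}, the table name, and the
   * key/value classes below are illustrative placeholders, not defaults of
   * this method:
   * <pre>
   * Job job = new Job(HBaseConfiguration.create(), "example-read");
   * Scan scan = new Scan();
   * scan.setCaching(500);        // assumption: tune to your row size/workload
   * scan.setCacheBlocks(false);  // advisable for full-table MapReduce scans
   * TableMapReduceUtil.initTableMapperJob("mytable", scan, MyMapper.class,
   *     Text.class, IntWritable.class, job);
   * </pre>
   *
   * @param table  The table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @throws IOException When setting up the details fails.
   */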
  public static void initTableMapperJob(String table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<?> outputKeyClass,
      Class<?> outputValueClass, Job job)
  throws IOException {
    initTableMapperJob(table, scan, mapper, outputKeyClass, outputValueClass,
        job, true);
  }
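
  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  Binary representation of the table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @throws IOException When setting up the details fails.
   */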
  public static void initTableMapperJob(byte[] table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<?> outputKeyClass,
      Class<?> outputValueClass, Job job)
  throws IOException {
    initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass, outputValueClass,
        job, true);
  }
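
  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  The table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the
   *   configured job classes via the distributed cache (tmpjars).
   * @param inputFormatClass  The class of the input format to use.
   * @throws IOException When setting up the details fails.
   */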
  public static void initTableMapperJob(String table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<?> outputKeyClass,
      Class<?> outputValueClass, Job job,
      boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass)
  throws IOException {
    job.setInputFormatClass(inputFormatClass);
    if (outputValueClass != null) job.setMapOutputValueClass(outputValueClass);
    if (outputKeyClass != null) job.setMapOutputKeyClass(outputKeyClass);
    job.setMapperClass(mapper);
    if (Put.class.equals(outputValueClass)) {
      job.setCombinerClass(PutCombiner.class);
    }
    Configuration conf = job.getConfiguration();
    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
    conf.set(TableInputFormat.INPUT_TABLE, table);
    conf.set(TableInputFormat.SCAN, convertScanToString(scan));
    conf.setStrings("io.serializations", conf.get("io.serializations"),
        MutationSerialization.class.getName(), ResultSerialization.class.getName(),
        KeyValueSerialization.class.getName());
    if (addDependencyJars) {
      addDependencyJars(job);
    }
    initCredentials(job);
  }
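
  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  Binary representation of the table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the
   *   configured job classes via the distributed cache (tmpjars).
   * @param inputFormatClass  The class of the input format to use.
   * @throws IOException When setting up the details fails.
   */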
  public static void initTableMapperJob(byte[] table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<?> outputKeyClass,
      Class<?> outputValueClass, Job job,
      boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass)
  throws IOException {
    initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass,
        outputValueClass, job, addDependencyJars, inputFormatClass);
  }
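
  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  Binary representation of the table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the
   *   configured job classes via the distributed cache (tmpjars).
   * @throws IOException When setting up the details fails.
   */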
  public static void initTableMapperJob(byte[] table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<?> outputKeyClass,
      Class<?> outputValueClass, Job job,
      boolean addDependencyJars)
  throws IOException {
    initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass,
        outputValueClass, job, addDependencyJars, TableInputFormat.class);
  }
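
  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  The table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the
   *   configured job classes via the distributed cache (tmpjars).
   * @throws IOException When setting up the details fails.
   */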
  public static void initTableMapperJob(String table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<?> outputKeyClass,
      Class<?> outputValueClass, Job job,
      boolean addDependencyJars)
  throws IOException {
    initTableMapperJob(table, scan, mapper, outputKeyClass,
        outputValueClass, job, addDependencyJars, TableInputFormat.class);
  }
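
  /**
   * Use this before submitting a Multi TableMap job. It will appropriately
   * set up the job.
   *
   * @param scans  The list of {@link Scan} objects to read from. Each scan
   *   should identify its target table, typically via the
   *   {@code Scan.SCAN_ATTRIBUTES_TABLE_NAME} attribute consumed by
   *   {@link MultiTableInputFormat}.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @throws IOException When setting up the details fails.
   */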
  public static void initTableMapperJob(List<Scan> scans,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job) throws IOException {
    initTableMapperJob(scans, mapper, outputKeyClass, outputValueClass, job,
        true);
  }
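
  /**
   * Use this before submitting a Multi TableMap job. It will appropriately
   * set up the job.
   *
   * @param scans  The list of {@link Scan} objects to read from.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the
   *   configured job classes via the distributed cache (tmpjars).
   * @throws IOException When setting up the details fails.
   */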
  public static void initTableMapperJob(List<Scan> scans,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job,
      boolean addDependencyJars) throws IOException {
    job.setInputFormatClass(MultiTableInputFormat.class);
    if (outputValueClass != null) {
      job.setMapOutputValueClass(outputValueClass);
    }
    if (outputKeyClass != null) {
      job.setMapOutputKeyClass(outputKeyClass);
    }
    job.setMapperClass(mapper);
    HBaseConfiguration.addHbaseResources(job.getConfiguration());
    List<String> scanStrings = new ArrayList<String>();

    for (Scan scan : scans) {
      scanStrings.add(convertScanToString(scan));
    }
    job.getConfiguration().setStrings(MultiTableInputFormat.SCANS,
        scanStrings.toArray(new String[scanStrings.size()]));

    if (addDependencyJars) {
      addDependencyJars(job);
    }
  }
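
  /**
   * Obtains an authentication token for the current user, if HBase security
   * is enabled, and adds it to the credentials of the given job. If a remote
   * cluster is configured via {@link TableOutputFormat#QUORUM_ADDRESS}, a
   * token for that cluster is obtained as well.
   *
   * @param job  The job that requires the permission.
   * @throws IOException When the authentication token cannot be obtained.
   */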
  public static void initCredentials(Job job) throws IOException {
    if (User.isHBaseSecurityEnabled(job.getConfiguration())) {
      try {
        // If a remote cluster is configured for output, obtain an auth token
        // for that cluster as well.
        String quorumAddress = job.getConfiguration().get(TableOutputFormat.QUORUM_ADDRESS);
        if (quorumAddress != null) {
          Configuration peerConf = HBaseConfiguration.create(job.getConfiguration());
          ZKUtil.applyClusterKeyToConf(peerConf, quorumAddress);
          User.getCurrent().obtainAuthTokenForJob(peerConf, job);
        }
        User.getCurrent().obtainAuthTokenForJob(job.getConfiguration(), job);
      } catch (InterruptedException ie) {
        LOG.info("Interrupted obtaining user authentication token");
        Thread.currentThread().interrupt(); // restore the interrupt status
      }
    }
  }
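
  /**
   * Writes the given scan into a Base64 encoded string.
   *
   * @param scan  The scan to write out.
   * @return The scan saved in a Base64 encoded string.
   * @throws IOException When writing the scan fails.
   */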
  static String convertScanToString(Scan scan) throws IOException {
    ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
    return Base64.encodeBytes(proto.toByteArray());
  }
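
  /**
   * Converts the given Base64 string back into a Scan instance.
   *
   * @param base64  The scan details.
   * @return The newly created Scan instance.
   * @throws IOException When reading the scan instance fails.
   */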
  static Scan convertStringToScan(String base64) throws IOException {
    byte [] decoded = Base64.decode(base64);
    ClientProtos.Scan scan;
    try {
      scan = ClientProtos.Scan.parseFrom(decoded);
    } catch (InvalidProtocolBufferException ipbe) {
      throw new IOException(ipbe);
    }
    return ProtobufUtil.toScan(scan);
  }
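
  /**
   * Use this before submitting a TableReduce job. It will appropriately set
   * up the JobConf.
   *
   * <p>A minimal usage sketch; {@code MyReducer} is an illustrative
   * placeholder for a user-supplied {@link TableReducer}:
   * <pre>
   * TableMapReduceUtil.initTableReducerJob("target_table", MyReducer.class, job);
   * </pre>
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.
   * @throws IOException When determining the region count fails.
   */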
  public static void initTableReducerJob(String table,
      Class<? extends TableReducer> reducer, Job job)
  throws IOException {
    initTableReducerJob(table, reducer, job, null);
  }
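
  /**
   * Use this before submitting a TableReduce job. It will appropriately set
   * up the JobConf.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   *   default partitioner.
   * @throws IOException When determining the region count fails.
   */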
  public static void initTableReducerJob(String table,
      Class<? extends TableReducer> reducer, Job job,
      Class partitioner) throws IOException {
    initTableReducerJob(table, reducer, job, partitioner, null, null, null);
  }
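
  /**
   * Use this before submitting a TableReduce job. It will appropriately set
   * up the JobConf.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   *   default partitioner.
   * @param quorumAddress Distant cluster to write to; default is null for
   *   output to the cluster that is designated in <code>hbase-site.xml</code>.
   *   Set this String to the zookeeper ensemble of an alternate remote
   *   cluster when you would have the reduce write to a cluster other than
   *   the default; e.g. when copying tables between clusters, the source
   *   would be designated by <code>hbase-site.xml</code> and this param would
   *   have the ensemble address of the remote cluster. The format to pass is
   *   <code>&lt;hbase.zookeeper.quorum&gt;:&lt;hbase.zookeeper.client.port&gt;:&lt;zookeeper.znode.parent&gt;</code>
   *   such as <code>server,server2,server3:2181:/hbase</code>.
   * @param serverClass redefined hbase.regionserver.class
   * @param serverImpl redefined hbase.regionserver.impl
   * @throws IOException When determining the region count fails.
   */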
  public static void initTableReducerJob(String table,
      Class<? extends TableReducer> reducer, Job job,
      Class partitioner, String quorumAddress, String serverClass,
      String serverImpl) throws IOException {
    initTableReducerJob(table, reducer, job, partitioner, quorumAddress,
        serverClass, serverImpl, true);
  }
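
  /**
   * Use this before submitting a TableReduce job. It will appropriately set
   * up the JobConf.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   *   default partitioner.
   * @param quorumAddress Distant cluster to write to; default is null for
   *   output to the cluster that is designated in <code>hbase-site.xml</code>.
   *   See the seven-argument overload for the expected cluster-key format.
   * @param serverClass redefined hbase.regionserver.class
   * @param serverImpl redefined hbase.regionserver.impl
   * @param addDependencyJars upload HBase jars and jars for any of the
   *   configured job classes via the distributed cache (tmpjars).
   * @throws IOException When determining the region count fails.
   */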
  public static void initTableReducerJob(String table,
      Class<? extends TableReducer> reducer, Job job,
      Class partitioner, String quorumAddress, String serverClass,
      String serverImpl, boolean addDependencyJars) throws IOException {

    Configuration conf = job.getConfiguration();
    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
    job.setOutputFormatClass(TableOutputFormat.class);
    if (reducer != null) job.setReducerClass(reducer);
    conf.set(TableOutputFormat.OUTPUT_TABLE, table);
    conf.setStrings("io.serializations", conf.get("io.serializations"),
        MutationSerialization.class.getName(), ResultSerialization.class.getName());
    // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
    if (quorumAddress != null) {
      // Calling this will validate the format.
      ZKUtil.transformClusterKey(quorumAddress);
      conf.set(TableOutputFormat.QUORUM_ADDRESS, quorumAddress);
    }
    if (serverClass != null && serverImpl != null) {
      conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
      conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
    }
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    if (partitioner == HRegionPartitioner.class) {
      job.setPartitionerClass(HRegionPartitioner.class);
      int regions = MetaReader.getRegionCount(conf, table);
      if (job.getNumReduceTasks() > regions) {
        job.setNumReduceTasks(regions);
      }
    } else if (partitioner != null) {
      job.setPartitionerClass(partitioner);
    }

    if (addDependencyJars) {
      addDependencyJars(job);
    }

    initCredentials(job);
  }
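
  /**
   * Ensures that the given number of reduce tasks for the given job
   * configuration does not exceed the number of regions for the given table.
   *
   * @param table  The table to get the region count for.
   * @param job  The current job to adjust.
   * @throws IOException When retrieving the table details fails.
   */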
  public static void limitNumReduceTasks(String table, Job job)
  throws IOException {
    int regions = MetaReader.getRegionCount(job.getConfiguration(), table);
    if (job.getNumReduceTasks() > regions) {
      job.setNumReduceTasks(regions);
    }
  }
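
  /**
   * Sets the number of reduce tasks for the given job configuration to the
   * number of regions the given table has.
   *
   * @param table  The table to get the region count for.
   * @param job  The current job to adjust.
   * @throws IOException When retrieving the table details fails.
   */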
  public static void setNumReduceTasks(String table, Job job)
  throws IOException {
    job.setNumReduceTasks(MetaReader.getRegionCount(job.getConfiguration(), table));
  }
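
  /**
   * Sets the number of rows to return and cache with each scanner iteration.
   * Higher caching values will enable faster mapreduce jobs at the expense of
   * requiring more heap to contain the cached rows.
   *
   * @param job  The current job to adjust.
   * @param batchSize  The number of rows to return in batch with each scanner
   *   iteration.
   */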
  public static void setScannerCaching(Job job, int batchSize) {
    job.getConfiguration().setInt("hbase.client.scanner.caching", batchSize);
  }
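
  /**
   * Adds the HBase dependency jars, as well as jars for any of the configured
   * job classes, to the job configuration so that JobClient will ship them to
   * the cluster and add them to the DistributedCache.
   *
   * @param job  The job to configure.
   * @throws IOException When a configured class cannot be resolved.
   */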
  public static void addDependencyJars(Job job) throws IOException {
    try {
      addDependencyJars(job.getConfiguration(),
          // explicitly pull a class from each HBase module
          org.apache.hadoop.hbase.HConstants.class,                      // hbase-common
          org.apache.hadoop.hbase.protobuf.generated.ClientProtos.class, // hbase-protocol
          org.apache.hadoop.hbase.client.Put.class,                      // hbase-client
          org.apache.hadoop.hbase.CompatibilityFactory.class,            // hbase-hadoop-compat
          // pull necessary third-party dependencies
          org.apache.zookeeper.ZooKeeper.class,
          com.google.protobuf.Message.class,
          com.google.common.collect.Lists.class,
          org.cloudera.htrace.Trace.class,
          // and the classes the job itself is configured with
          job.getMapOutputKeyClass(),
          job.getMapOutputValueClass(),
          job.getInputFormatClass(),
          job.getOutputKeyClass(),
          job.getOutputValueClass(),
          job.getOutputFormatClass(),
          job.getPartitionerClass(),
          job.getCombinerClass());
    } catch (ClassNotFoundException e) {
      throw new IOException(e);
    }
  }
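
  /**
   * Adds the jars containing the given classes to the job's configuration
   * such that JobClient will ship them to the cluster and add them to the
   * DistributedCache.
   *
   * <p>A hedged example of shipping a jar for a custom job class; the filter
   * class name is an illustrative placeholder:
   * <pre>
   * TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
   *     MyCustomFilter.class);
   * </pre>
   *
   * @param conf  The job configuration to amend.
   * @param classes  The classes whose containing jars should be shipped.
   * @throws IOException When locating or creating a jar fails.
   */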
  public static void addDependencyJars(Configuration conf,
      Class<?>... classes) throws IOException {

    FileSystem localFs = FileSystem.getLocal(conf);
    Set<String> jars = new HashSet<String>();
    // Add jars that are already in the tmpjars variable.
    jars.addAll(conf.getStringCollection("tmpjars"));

    // The map caches the association from jar contents to jar path, so that
    // classes that have already been packaged are not jarred up a second time.
    Map<String, String> packagedClasses = new HashMap<String, String>();

    // Add jars containing the specified classes.
    for (Class<?> clazz : classes) {
      if (clazz == null) continue;

      Path path = findOrCreateJar(clazz, localFs, packagedClasses);
      if (path == null) {
        LOG.warn("Could not find jar for class " + clazz +
            " in order to ship it to the cluster.");
        continue;
      }
      if (!localFs.exists(path)) {
        LOG.warn("Could not validate jar file " + path + " for class " + clazz);
        continue;
      }
      jars.add(path.toString());
    }
    if (jars.isEmpty()) return;

    conf.set("tmpjars",
        StringUtils.arrayToString(jars.toArray(new String[0])));
  }
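
  /**
   * Finds the jar for the given class, or creates one on the fly if the class
   * comes from a directory on the classpath. Created jars land in the system
   * temporary directory. Also consults and maintains
   * <code>packagedClasses</code>, a mapping from jar contents to the jar that
   * packages them, so the same classes are not jarred twice.
   *
   * @param my_class the class to find.
   * @param fs the FileSystem with which to qualify the returned path.
   * @param packagedClasses a map of class name to containing jar.
   * @return a jar file that contains the class.
   * @throws IOException when no resource for the class can be located.
   */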
  private static Path findOrCreateJar(Class<?> my_class, FileSystem fs,
      Map<String, String> packagedClasses)
  throws IOException {
    // Attempt to locate an existing jar for the class.
    String jar = findContainingJar(my_class, packagedClasses);
    if (null == jar || jar.isEmpty()) {
      jar = getJar(my_class);
      updateMap(jar, packagedClasses);
    }

    if (null == jar || jar.isEmpty()) {
      throw new IOException("Cannot locate resource for class " + my_class.getName());
    }

    LOG.debug(String.format("For class %s, using jar %s", my_class.getName(), jar));
    return new Path(jar).makeQualified(fs);
  }
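
  /**
   * Adds entries to <code>packagedClasses</code> corresponding to class files
   * contained in <code>jar</code>.
   *
   * @param jar  The jar whose contents to list.
   * @param packagedClasses  map of class name to containing jar.
   * @throws IOException When the jar cannot be read.
   */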
  private static void updateMap(String jar, Map<String, String> packagedClasses) throws IOException {
    if (null == jar || jar.isEmpty()) return;
    ZipFile zip = null;
    try {
      zip = new ZipFile(jar);
      for (Enumeration<? extends ZipEntry> iter = zip.entries(); iter.hasMoreElements();) {
        ZipEntry entry = iter.nextElement();
        if (entry.getName().endsWith(".class")) {
          packagedClasses.put(entry.getName(), jar);
        }
      }
    } finally {
      if (null != zip) zip.close();
    }
  }
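
  /**
   * Finds a jar that contains a class of the same name, if any. It will
   * return a jar file, even if that is not the first thing on the class path
   * that has a class with the same name. Looks first on the classpath and
   * then in the <code>packagedClasses</code> map.
   *
   * @param my_class the class to find.
   * @param packagedClasses a map of class name to containing jar.
   * @return a jar file that contains the class, or null.
   * @throws IOException When reading the classpath resources fails.
   */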
  private static String findContainingJar(Class<?> my_class, Map<String, String> packagedClasses)
  throws IOException {
    ClassLoader loader = my_class.getClassLoader();
    String class_file = my_class.getName().replaceAll("\\.", "/") + ".class";

    // First search the classpath.
    for (Enumeration<URL> itr = loader.getResources(class_file); itr.hasMoreElements();) {
      URL url = itr.nextElement();
      if ("jar".equals(url.getProtocol())) {
        String toReturn = url.getPath();
        if (toReturn.startsWith("file:")) {
          toReturn = toReturn.substring("file:".length());
        }
        // URLDecoder is a misnamed class, since it actually decodes
        // x-www-form-urlencoded MIME type rather than %-encoded entities.
        // Escape '+' before decoding so it is not turned into a space.
        toReturn = toReturn.replaceAll("\\+", "%2B");
        toReturn = URLDecoder.decode(toReturn, "UTF-8");
        return toReturn.replaceAll("!.*$", "");
      }
    }

    // Now look in any jars we've packaged using JarFinder. Returns null when
    // no jar is found.
    return packagedClasses.get(class_file);
  }
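
  /**
   * Invokes 'getJar' on a JarFinder implementation. Prefers Hadoop's
   * <code>org.apache.hadoop.util.JarFinder</code> when it is on the classpath
   * (Hadoop 0.23+); otherwise falls back to the backported copy shipped with
   * HBase.
   *
   * @param my_class the class to find.
   * @return a jar file that contains the class, or null.
   */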
  private static String getJar(Class<?> my_class) {
    String ret = null;
    String hadoopJarFinder = "org.apache.hadoop.util.JarFinder";
    Class<?> jarFinder = null;
    try {
      LOG.debug("Looking for " + hadoopJarFinder + ".");
      jarFinder = Class.forName(hadoopJarFinder);
      LOG.debug(hadoopJarFinder + " found.");
      Method getJar = jarFinder.getMethod("getJar", Class.class);
      ret = (String) getJar.invoke(null, my_class);
    } catch (ClassNotFoundException e) {
      LOG.debug("Using backported JarFinder.");
      ret = JarFinder.getJar(my_class);
    } catch (InvocationTargetException e) {
      // The function was properly called, but threw an exception of its own.
      throw new RuntimeException(e.getCause());
    } catch (Exception e) {
      // All other reflection failures are unexpected.
      throw new RuntimeException("getJar invocation failed.", e);
    }

    return ret;
  }
}