/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.hadoopbackport.JarFinder;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.StringUtils;
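
/**
 * Utility for {@link TableMapper} and {@link TableReducer}.
 *
 * <p>A minimal usage sketch for a map-only read job; the table name and
 * {@code MyMapper} are illustrative, not part of this class:
 *
 * <pre>
 * Scan scan = new Scan();
 * scan.setCaching(500);        // raise from the default of 1 row per RPC
 * scan.setCacheBlocks(false);  // don't fill the region server block cache
 * Job job = new Job(conf, "ExampleRead");
 * TableMapReduceUtil.initTableMapperJob(
 *   "exampleTable",     // input table
 *   scan,
 *   MyMapper.class,     // mapper extending TableMapper
 *   null,               // mapper output key, null for a map-only job
 *   null,               // mapper output value
 *   job);
 * </pre>
 */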
@SuppressWarnings("unchecked")
public class TableMapReduceUtil {
  static Log LOG = LogFactory.getLog(TableMapReduceUtil.class);
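
  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  The table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @throws IOException When setting up the details fails.
   */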
  public static void initTableMapperJob(String table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job)
  throws IOException {
    initTableMapperJob(table, scan, mapper, outputKeyClass, outputValueClass,
        job, true);
  }
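
  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table Binary representation of the table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @throws IOException When setting up the details fails.
   */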
  public static void initTableMapperJob(byte[] table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job)
  throws IOException {
    initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass, outputValueClass,
        job, true);
  }
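
  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  The table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the
   * configured job classes via the distributed cache (tmpjars).
   * @param inputFormatClass The class of the input format.
   * @throws IOException When setting up the details fails.
   */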
  public static void initTableMapperJob(String table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job,
      boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass)
  throws IOException {
    job.setInputFormatClass(inputFormatClass);
    if (outputValueClass != null) job.setMapOutputValueClass(outputValueClass);
    if (outputKeyClass != null) job.setMapOutputKeyClass(outputKeyClass);
    job.setMapperClass(mapper);
    Configuration conf = job.getConfiguration();
    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
    conf.set(TableInputFormat.INPUT_TABLE, table);
    conf.set(TableInputFormat.SCAN, convertScanToString(scan));
    if (addDependencyJars) {
      addDependencyJars(job);
    }
    initCredentials(job);
  }
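
  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table Binary representation of the table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the
   * configured job classes via the distributed cache (tmpjars).
   * @param inputFormatClass The class of the input format.
   * @throws IOException When setting up the details fails.
   */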
  public static void initTableMapperJob(byte[] table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job,
      boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass)
  throws IOException {
    initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass,
        outputValueClass, job, addDependencyJars, inputFormatClass);
  }
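
  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table Binary representation of the table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the
   * configured job classes via the distributed cache (tmpjars).
   * @throws IOException When setting up the details fails.
   */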
  public static void initTableMapperJob(byte[] table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job,
      boolean addDependencyJars)
  throws IOException {
    initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass,
        outputValueClass, job, addDependencyJars, TableInputFormat.class);
  }
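
  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  The table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the
   * configured job classes via the distributed cache (tmpjars).
   * @throws IOException When setting up the details fails.
   */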
  public static void initTableMapperJob(String table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job,
      boolean addDependencyJars)
  throws IOException {
    initTableMapperJob(table, scan, mapper, outputKeyClass,
        outputValueClass, job, addDependencyJars, TableInputFormat.class);
  }
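
  /**
   * Use this before submitting a Multi TableMap job. It will appropriately
   * set up the job.
   *
   * @param scans The list of {@link Scan} objects to read from.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @throws IOException When setting up the details fails.
   */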
  public static void initTableMapperJob(List<Scan> scans,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job) throws IOException {
    initTableMapperJob(scans, mapper, outputKeyClass, outputValueClass, job,
        true);
  }
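
  /**
   * Use this before submitting a Multi TableMap job. It will appropriately
   * set up the job.
   *
   * @param scans The list of {@link Scan} objects to read from.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the
   * configured job classes via the distributed cache (tmpjars).
   * @throws IOException When setting up the details fails.
   */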
  public static void initTableMapperJob(List<Scan> scans,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job,
      boolean addDependencyJars) throws IOException {
    job.setInputFormatClass(MultiTableInputFormat.class);
    if (outputValueClass != null) {
      job.setMapOutputValueClass(outputValueClass);
    }
    if (outputKeyClass != null) {
      job.setMapOutputKeyClass(outputKeyClass);
    }
    job.setMapperClass(mapper);
    HBaseConfiguration.addHbaseResources(job.getConfiguration());
    List<String> scanStrings = new ArrayList<String>();

    for (Scan scan : scans) {
      scanStrings.add(convertScanToString(scan));
    }
    job.getConfiguration().setStrings(MultiTableInputFormat.SCANS,
        scanStrings.toArray(new String[scanStrings.size()]));

    if (addDependencyJars) {
      addDependencyJars(job);
    }
  }
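
  /**
   * Obtains an authentication token for the current user and adds it to the
   * job so its tasks can authenticate to HBase. Does nothing unless HBase
   * security is enabled. When TableOutputFormat.QUORUM_ADDRESS names a
   * remote cluster, a token for that cluster is obtained as well.
   *
   * @param job The job that requires the permission.
   * @throws IOException When obtaining the authentication token fails.
   */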
  public static void initCredentials(Job job) throws IOException {
    if (User.isHBaseSecurityEnabled(job.getConfiguration())) {
      try {
        // If output is directed at a peer cluster via
        // TableOutputFormat.QUORUM_ADDRESS, obtain a token for that
        // cluster too.
        String quorumAddress = job.getConfiguration().get(
            TableOutputFormat.QUORUM_ADDRESS);
        if (quorumAddress != null) {
          String[] parts = ZKUtil.transformClusterKey(quorumAddress);
          Configuration peerConf = HBaseConfiguration.create(job
              .getConfiguration());
          peerConf.set(HConstants.ZOOKEEPER_QUORUM, parts[0]);
          peerConf.set("hbase.zookeeper.client.port", parts[1]);
          peerConf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, parts[2]);
          User.getCurrent().obtainAuthTokenForJob(peerConf, job);
        }

        User.getCurrent().obtainAuthTokenForJob(job.getConfiguration(), job);
      } catch (InterruptedException ie) {
        LOG.info("Interrupted obtaining user authentication token");
        // Restore the interrupt status rather than clearing it.
        Thread.currentThread().interrupt();
      }
    }
  }
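
  /**
   * Writes the given scan into a Base64 encoded string.
   *
   * @param scan  The scan to write out.
   * @return The scan saved in a Base64 encoded string.
   * @throws IOException When writing the scan fails.
   */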
  static String convertScanToString(Scan scan) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    DataOutputStream dos = new DataOutputStream(out);
    scan.write(dos);
    return Base64.encodeBytes(out.toByteArray());
  }
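
  /**
   * Converts the given Base64 string back into a Scan instance.
   *
   * @param base64  The scan details.
   * @return The newly created Scan instance.
   * @throws IOException When reading the scan instance fails.
   */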
  static Scan convertStringToScan(String base64) throws IOException {
    ByteArrayInputStream bis = new ByteArrayInputStream(Base64.decode(base64));
    DataInputStream dis = new DataInputStream(bis);
    Scan scan = new Scan();
    scan.readFields(dis);
    return scan;
  }
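
  /**
   * Use this before submitting a TableReduce job. It will appropriately set
   * up the JobConf.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.
   * @throws IOException When determining the region count fails.
   */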
  public static void initTableReducerJob(String table,
      Class<? extends TableReducer> reducer, Job job)
  throws IOException {
    initTableReducerJob(table, reducer, job, null);
  }
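
  /**
   * Use this before submitting a TableReduce job. It will appropriately set
   * up the JobConf.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   * default partitioner.
   * @throws IOException When determining the region count fails.
   */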
  public static void initTableReducerJob(String table,
      Class<? extends TableReducer> reducer, Job job,
      Class partitioner) throws IOException {
    initTableReducerJob(table, reducer, job, partitioner, null, null, null);
  }
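
  /**
   * Use this before submitting a TableReduce job. It will appropriately set
   * up the JobConf.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   * default partitioner.
   * @param quorumAddress Distant cluster to write to; default is null for
   * output to the cluster that is designated in <code>hbase-site.xml</code>.
   * Set this String to the zookeeper ensemble of an alternate remote cluster
   * when you would have the reduce write to a cluster other than the default,
   * e.g. when copying tables between clusters. The format to pass is
   * <code>&lt;quorum&gt;:&lt;client port&gt;:&lt;znode parent&gt;</code>,
   * such as <code>server1,server2,server3:2181:/hbase</code>.
   * @param serverClass redefined hbase.regionserver.class
   * @param serverImpl redefined hbase.regionserver.impl
   * @throws IOException When determining the region count fails.
   */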
  public static void initTableReducerJob(String table,
      Class<? extends TableReducer> reducer, Job job,
      Class partitioner, String quorumAddress, String serverClass,
      String serverImpl) throws IOException {
    initTableReducerJob(table, reducer, job, partitioner, quorumAddress,
        serverClass, serverImpl, true);
  }
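
  /**
   * Use this before submitting a TableReduce job. It will appropriately set
   * up the JobConf.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   * default partitioner.
   * @param quorumAddress Distant cluster to write to; default is null for
   * output to the cluster that is designated in <code>hbase-site.xml</code>.
   * See the overload above for the expected format.
   * @param serverClass redefined hbase.regionserver.class
   * @param serverImpl redefined hbase.regionserver.impl
   * @param addDependencyJars upload HBase jars and jars for any of the
   * configured job classes via the distributed cache (tmpjars).
   * @throws IOException When determining the region count fails.
   */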
  public static void initTableReducerJob(String table,
      Class<? extends TableReducer> reducer, Job job,
      Class partitioner, String quorumAddress, String serverClass,
      String serverImpl, boolean addDependencyJars) throws IOException {

    Configuration conf = job.getConfiguration();
    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
    job.setOutputFormatClass(TableOutputFormat.class);
    if (reducer != null) job.setReducerClass(reducer);
    conf.set(TableOutputFormat.OUTPUT_TABLE, table);

    // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
    if (quorumAddress != null) {
      // Calling this will validate the format
      ZKUtil.transformClusterKey(quorumAddress);
      conf.set(TableOutputFormat.QUORUM_ADDRESS, quorumAddress);
    }
    if (serverClass != null && serverImpl != null) {
      conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
      conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
    }
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    if (partitioner == HRegionPartitioner.class) {
      job.setPartitionerClass(HRegionPartitioner.class);
      HTable outputTable = new HTable(conf, table);
      int regions = outputTable.getRegionsInfo().size();
      if (job.getNumReduceTasks() > regions) {
        job.setNumReduceTasks(regions);
      }
    } else if (partitioner != null) {
      job.setPartitionerClass(partitioner);
    }

    if (addDependencyJars) {
      addDependencyJars(job);
    }

    initCredentials(job);
  }
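
  /**
   * Ensures that the given number of reduce tasks for the given job
   * configuration does not exceed the number of regions for the given table.
   *
   * @param table  The table to get the region count for.
   * @param job  The current job to adjust.
   * @throws IOException When retrieving the table details fails.
   */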
  public static void limitNumReduceTasks(String table, Job job)
  throws IOException {
    HTable outputTable = new HTable(job.getConfiguration(), table);
    int regions = outputTable.getRegionsInfo().size();
    if (job.getNumReduceTasks() > regions) {
      job.setNumReduceTasks(regions);
    }
  }
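
  /**
   * Sets the number of reduce tasks for the given job configuration to the
   * number of regions the given table has.
   *
   * @param table  The table to get the region count for.
   * @param job  The current job to adjust.
   * @throws IOException When retrieving the table details fails.
   */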
  public static void setNumReduceTasks(String table, Job job)
  throws IOException {
    HTable outputTable = new HTable(job.getConfiguration(), table);
    int regions = outputTable.getRegionsInfo().size();
    job.setNumReduceTasks(regions);
  }
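
  /**
   * Sets the number of rows to return and cache with each scanner iteration.
   * Higher caching values will enable faster mapreduce jobs at the expense of
   * requiring more heap to contain the cached rows.
   *
   * @param job The current job to adjust.
   * @param batchSize The number of rows to return in batch with each scanner
   * iteration.
   */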
  public static void setScannerCaching(Job job, int batchSize) {
    job.getConfiguration().setInt("hbase.client.scanner.caching", batchSize);
  }
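
  /**
   * Adds the HBase dependency jars as well as jars for any of the configured
   * job classes to the job configuration, so that JobClient will ship them
   * to the cluster and add them to the DistributedCache.
   *
   * @param job The job to adjust.
   * @throws IOException When a configured job class cannot be resolved.
   */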
  public static void addDependencyJars(Job job) throws IOException {
    try {
      addDependencyJars(job.getConfiguration(),
          org.apache.zookeeper.ZooKeeper.class,
          com.google.protobuf.Message.class,
          com.google.common.collect.ImmutableSet.class,
          org.apache.hadoop.hbase.util.Bytes.class,
          job.getMapOutputKeyClass(),
          job.getMapOutputValueClass(),
          job.getInputFormatClass(),
          job.getOutputKeyClass(),
          job.getOutputValueClass(),
          job.getOutputFormatClass(),
          job.getPartitionerClass(),
          job.getCombinerClass());
    } catch (ClassNotFoundException e) {
      throw new IOException(e);
    }
  }
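
  /**
   * Adds the jars containing the given classes to the job's configuration
   * such that JobClient will ship them to the cluster and add them to the
   * DistributedCache, via the <code>tmpjars</code> variable.
   *
   * @param conf The job configuration to adjust.
   * @param classes The classes whose containing jars are needed on the task
   * classpath; null entries are skipped.
   * @throws IOException When a jar cannot be found or created.
   */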
  public static void addDependencyJars(Configuration conf,
      Class<?>... classes) throws IOException {

    FileSystem localFs = FileSystem.getLocal(conf);
    Set<String> jars = new HashSet<String>();
    // Keep any jars already present in the tmpjars variable.
    jars.addAll(conf.getStringCollection("tmpjars"));

    // Map from class file name to containing jar, so we avoid creating a new
    // jar for a class that has already been packaged.
    Map<String, String> packagedClasses = new HashMap<String, String>();

    // Add jars containing the specified classes.
    for (Class<?> clazz : classes) {
      if (clazz == null) continue;

      Path path = findOrCreateJar(clazz, localFs, packagedClasses);
      if (path == null) {
        LOG.warn("Could not find jar for class " + clazz +
            " in order to ship it to the cluster.");
        continue;
      }
      if (!localFs.exists(path)) {
        LOG.warn("Could not validate jar file " + path + " for class "
            + clazz);
        continue;
      }
      jars.add(path.toString());
    }
    if (jars.isEmpty()) return;

    conf.set("tmpjars",
        StringUtils.arrayToString(jars.toArray(new String[0])));
  }
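
  /**
   * Finds the jar for the given class, or creates one when the class is not
   * already packaged in a jar (for example, when it sits in a classpath
   * directory). The <code>packagedClasses</code> map caches class-file to
   * jar mappings so classes already seen are not repackaged.
   *
   * @param my_class the class to find.
   * @param fs the FileSystem with which to qualify the returned path.
   * @param packagedClasses a map of class file name to containing jar.
   * @return a qualified path to a jar file that contains the class.
   * @throws IOException if no resource can be located for the class.
   */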
  private static Path findOrCreateJar(Class<?> my_class, FileSystem fs,
      Map<String, String> packagedClasses)
  throws IOException {
    // Attempt to locate an existing jar for the class.
    String jar = findContainingJar(my_class, packagedClasses);
    if (null == jar || jar.isEmpty()) {
      jar = getJar(my_class);
      updateMap(jar, packagedClasses);
    }

    if (null == jar || jar.isEmpty()) {
      throw new IOException("Cannot locate resource for class " + my_class.getName());
    }

    LOG.debug(String.format("For class %s, using jar %s", my_class.getName(), jar));
    return new Path(jar).makeQualified(fs);
  }
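
  /**
   * Adds entries to <code>packagedClasses</code> for the class files
   * contained in <code>jar</code>.
   *
   * @param jar The jar whose content to list.
   * @param packagedClasses map[class file name -> jar]
   * @throws IOException When reading the jar fails.
   */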
  private static void updateMap(String jar, Map<String, String> packagedClasses) throws IOException {
    // Guard against a missing jar; getJar may return null.
    if (null == jar || jar.isEmpty()) {
      return;
    }
    ZipFile zip = null;
    try {
      zip = new ZipFile(jar);
      for (Enumeration<? extends ZipEntry> iter = zip.entries(); iter.hasMoreElements();) {
        ZipEntry entry = iter.nextElement();
        if (entry.getName().endsWith("class")) {
          packagedClasses.put(entry.getName(), jar);
        }
      }
    } finally {
      if (null != zip) zip.close();
    }
  }
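
  /**
   * Finds a jar that contains a class of the same name, if any. It will
   * return a jar file, even if that is not the first thing on the class path
   * that has a class with the same name. Looks first on the classpath and
   * then in the <code>packagedClasses</code> map.
   *
   * @param my_class the class to find.
   * @param packagedClasses a map of class file name to containing jar.
   * @return a jar file that contains the class, or null.
   * @throws IOException When enumerating classpath resources fails.
   */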
  private static String findContainingJar(Class<?> my_class, Map<String, String> packagedClasses)
      throws IOException {
    ClassLoader loader = my_class.getClassLoader();
    String class_file = my_class.getName().replaceAll("\\.", "/") + ".class";

    // First search the classpath.
    for (Enumeration<URL> itr = loader.getResources(class_file); itr.hasMoreElements();) {
      URL url = itr.nextElement();
      if ("jar".equals(url.getProtocol())) {
        String toReturn = url.getPath();
        if (toReturn.startsWith("file:")) {
          toReturn = toReturn.substring("file:".length());
        }
        // URLDecoder is a misnamed class: it actually decodes the
        // x-www-form-urlencoded MIME type rather than real URL encoding
        // (which the file path has). It would therefore decode +s to ' 's,
        // which is incorrect (spaces are either unencoded or encoded as
        // "%20"). Replace +s first so they survive the decoding process.
        toReturn = toReturn.replaceAll("\\+", "%2B");
        toReturn = URLDecoder.decode(toReturn, "UTF-8");
        return toReturn.replaceAll("!.*$", "");
      }
    }

    // Now look in any jars we've packaged using JarFinder. Returns null when
    // the class is not found there either.
    return packagedClasses.get(class_file);
  }
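
  /**
   * Invokes 'getJar' on a JarFinder implementation. Uses Hadoop's own
   * <code>org.apache.hadoop.util.JarFinder</code> when it is available on the
   * classpath (located via reflection); otherwise falls back to the backport
   * shipped with HBase.
   *
   * @param my_class the class to find.
   * @return a jar file that contains the class, or null.
   */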
  private static String getJar(Class<?> my_class) {
    String ret = null;
    String hadoopJarFinder = "org.apache.hadoop.util.JarFinder";
    Class<?> jarFinder = null;
    try {
      LOG.debug("Looking for " + hadoopJarFinder + ".");
      jarFinder = Class.forName(hadoopJarFinder);
      LOG.debug(hadoopJarFinder + " found.");
      Method getJar = jarFinder.getMethod("getJar", Class.class);
      ret = (String) getJar.invoke(null, my_class);
    } catch (ClassNotFoundException e) {
      LOG.debug("Using backported JarFinder.");
      ret = JarFinder.getJar(my_class);
    } catch (InvocationTargetException e) {
      // getJar was properly invoked but threw its own exception;
      // unwrap it and pass it on.
      throw new RuntimeException(e.getCause());
    } catch (Exception e) {
      // Toss all other exceptions, related to reflection failure.
      throw new RuntimeException("getJar invocation failed.", e);
    }

    return ret;
  }
}