/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.UserProvider;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.hadoopbackport.JarFinder;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.StringUtils;

/**
 * Utility for {@link TableMapper} and {@link TableReducer}.
 */
@SuppressWarnings("unchecked")
public class TableMapReduceUtil {
  static final Log LOG = LogFactory.getLog(TableMapReduceUtil.class);

  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  The table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(String table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job)
      throws IOException {
    initTableMapperJob(table, scan, mapper, outputKeyClass, outputValueClass,
        job, true);
  }

  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  Binary representation of the table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(byte[] table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job)
      throws IOException {
    initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass, outputValueClass,
        job, true);
  }

  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  The table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @param addDependencyJars  Upload HBase jars and jars for any of the
   *   configured job classes via the distributed cache (tmpjars).
   * @param inputFormatClass  The input format class to use.
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(String table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job,
      boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass)
      throws IOException {
    job.setInputFormatClass(inputFormatClass);
    if (outputValueClass != null) job.setMapOutputValueClass(outputValueClass);
    if (outputKeyClass != null) job.setMapOutputKeyClass(outputKeyClass);
    job.setMapperClass(mapper);
    Configuration conf = job.getConfiguration();
    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
    conf.set(TableInputFormat.INPUT_TABLE, table);
    conf.set(TableInputFormat.SCAN, convertScanToString(scan));
    if (addDependencyJars) {
      addDependencyJars(job);
    }
    initCredentials(job);
  }
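
  // Usage sketch (hypothetical driver code, not part of this class): a
  // minimal map-only scan job. The table name "mytable" and MyMapper, a
  // TableMapper<Text, IntWritable> subclass, are assumptions for
  // illustration.
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   Job job = new Job(conf, "scan mytable");
  //   job.setJarByClass(MyMapper.class);
  //   Scan scan = new Scan();
  //   scan.setCaching(500);        // larger per-RPC batch for MR scans
  //   scan.setCacheBlocks(false);  // don't churn the server block cache
  //   TableMapReduceUtil.initTableMapperJob("mytable", scan, MyMapper.class,
  //       Text.class, IntWritable.class, job);
  //   job.setNumReduceTasks(0);    // map-only
  //   job.waitForCompletion(true);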

  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  Binary representation of the table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @param addDependencyJars  Upload HBase jars and jars for any of the
   *   configured job classes via the distributed cache (tmpjars).
   * @param inputFormatClass  The input format class to use.
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(byte[] table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job,
      boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass)
      throws IOException {
    initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass,
        outputValueClass, job, addDependencyJars, inputFormatClass);
  }

  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  Binary representation of the table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @param addDependencyJars  Upload HBase jars and jars for any of the
   *   configured job classes via the distributed cache (tmpjars).
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(byte[] table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job,
      boolean addDependencyJars)
      throws IOException {
    initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass,
        outputValueClass, job, addDependencyJars, TableInputFormat.class);
  }

  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  The table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @param addDependencyJars  Upload HBase jars and jars for any of the
   *   configured job classes via the distributed cache (tmpjars).
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(String table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job,
      boolean addDependencyJars)
      throws IOException {
    initTableMapperJob(table, scan, mapper, outputKeyClass,
        outputValueClass, job, addDependencyJars, TableInputFormat.class);
  }

  /**
   * Use this before submitting a Multi TableMap job. It will appropriately
   * set up the job.
   *
   * @param scans  The list of {@link Scan} objects to read from.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(List<Scan> scans,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job) throws IOException {
    initTableMapperJob(scans, mapper, outputKeyClass, outputValueClass, job,
        true);
  }

  /**
   * Use this before submitting a Multi TableMap job. It will appropriately
   * set up the job.
   *
   * @param scans  The list of {@link Scan} objects to read from.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @param addDependencyJars  Upload HBase jars and jars for any of the
   *   configured job classes via the distributed cache (tmpjars).
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(List<Scan> scans,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job,
      boolean addDependencyJars) throws IOException {
    job.setInputFormatClass(MultiTableInputFormat.class);
    if (outputValueClass != null) {
      job.setMapOutputValueClass(outputValueClass);
    }
    if (outputKeyClass != null) {
      job.setMapOutputKeyClass(outputKeyClass);
    }
    job.setMapperClass(mapper);
    HBaseConfiguration.addHbaseResources(job.getConfiguration());
    List<String> scanStrings = new ArrayList<String>();

    for (Scan scan : scans) {
      scanStrings.add(convertScanToString(scan));
    }
    job.getConfiguration().setStrings(MultiTableInputFormat.SCANS,
        scanStrings.toArray(new String[scanStrings.size()]));

    if (addDependencyJars) {
      addDependencyJars(job);
    }
  }
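
  // Usage sketch (hypothetical): one job reading from two tables. With
  // MultiTableInputFormat, each Scan names its table through the
  // Scan.SCAN_ATTRIBUTES_TABLE_NAME attribute; MyMapper is an assumed
  // TableMapper subclass.
  //
  //   List<Scan> scans = new ArrayList<Scan>();
  //   Scan scan1 = new Scan();
  //   scan1.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes("table1"));
  //   scans.add(scan1);
  //   Scan scan2 = new Scan();
  //   scan2.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes("table2"));
  //   scans.add(scan2);
  //   TableMapReduceUtil.initTableMapperJob(scans, MyMapper.class,
  //       ImmutableBytesWritable.class, Result.class, job);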

  /**
   * Obtains delegation tokens for the job so that tasks can authenticate to
   * HBase when security is enabled.
   *
   * @param job  The current job to adjust.
   * @throws IOException When obtaining the authentication token fails.
   */
  public static void initCredentials(Job job) throws IOException {
    UserProvider provider = UserProvider.instantiate(job.getConfiguration());
    if (provider.isHBaseSecurityEnabled()) {
      try {
        // Obtain an authentication token for the remote ("peer") cluster,
        // if the job is configured to write to one.
        String quorumAddress = job.getConfiguration().get(
            TableOutputFormat.QUORUM_ADDRESS);
        User user = provider.getCurrent();
        if (quorumAddress != null) {
          String[] parts = ZKUtil.transformClusterKey(quorumAddress);
          Configuration peerConf = HBaseConfiguration.create(job
              .getConfiguration());
          peerConf.set(HConstants.ZOOKEEPER_QUORUM, parts[0]);
          peerConf.set("hbase.zookeeper.client.port", parts[1]);
          peerConf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, parts[2]);
          user.obtainAuthTokenForJob(peerConf, job);
        }

        // Obtain an authentication token for the local cluster.
        user.obtainAuthTokenForJob(job.getConfiguration(), job);
      } catch (InterruptedException ie) {
        LOG.info("Interrupted obtaining user authentication token");
        // Restore the interrupt status rather than clearing it.
        Thread.currentThread().interrupt();
      }
    }
  }
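
  // Note (hypothetical usage): the single-table initTableMapperJob and
  // initTableReducerJob overloads call initCredentials automatically, so a
  // direct call is typically only needed for hand-rolled job setups, e.g.:
  //
  //   Job job = new Job(HBaseConfiguration.create(), "custom wiring");
  //   // ... custom input/output format configuration ...
  //   TableMapReduceUtil.initCredentials(job);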

  /**
   * Writes the given scan into a Base64 encoded string.
   *
   * @param scan  The scan to write out.
   * @return The scan saved in a Base64 encoded string.
   * @throws IOException When writing the scan fails.
   */
  static String convertScanToString(Scan scan) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    DataOutputStream dos = new DataOutputStream(out);
    scan.write(dos);
    return Base64.encodeBytes(out.toByteArray());
  }

  /**
   * Converts the given Base64 string back into a Scan instance.
   *
   * @param base64  The scan details.
   * @return The newly created Scan instance.
   * @throws IOException When reading the scan instance fails.
   */
  static Scan convertStringToScan(String base64) throws IOException {
    ByteArrayInputStream bis = new ByteArrayInputStream(Base64.decode(base64));
    DataInputStream dis = new DataInputStream(bis);
    Scan scan = new Scan();
    scan.readFields(dis);
    return scan;
  }
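
  // The two converters above form a round trip; a sketch of the invariant
  // (hypothetical test code, row keys chosen arbitrarily):
  //
  //   Scan scan = new Scan(Bytes.toBytes("row-000"), Bytes.toBytes("row-999"));
  //   String encoded = convertScanToString(scan);
  //   Scan decoded = convertStringToScan(encoded);
  //   // decoded carries the same start/stop rows, families and time range
  //   // as the original scan.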

  /**
   * Use this before submitting a TableReduce job. It will appropriately set
   * up the job.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReducerJob(String table,
      Class<? extends TableReducer> reducer, Job job)
      throws IOException {
    initTableReducerJob(table, reducer, job, null);
  }

  /**
   * Use this before submitting a TableReduce job. It will appropriately set
   * up the job.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   *   default partitioner.
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReducerJob(String table,
      Class<? extends TableReducer> reducer, Job job,
      Class partitioner) throws IOException {
    initTableReducerJob(table, reducer, job, partitioner, null, null, null);
  }

  /**
   * Use this before submitting a TableReduce job. It will appropriately set
   * up the job.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   *   default partitioner.
   * @param quorumAddress  Distant cluster to write to; default is null for
   *   output to the cluster that is designated in <code>hbase-site.xml</code>.
   *   Set this String to the zookeeper ensemble of an alternate remote
   *   cluster when you would have the reduce write to a cluster other than
   *   the default; e.g. when copying tables between clusters, the source
   *   would be designated by <code>hbase-site.xml</code> and this param would
   *   have the ensemble address of the remote cluster. The format to pass is
   *   <code>&lt;hbase.zookeeper.quorum&gt;:&lt;hbase.zookeeper.client.port&gt;:&lt;zookeeper.znode.parent&gt;</code>,
   *   such as <code>server,server2,server3:2181:/hbase</code>.
   * @param serverClass  Redefined hbase.regionserver.class.
   * @param serverImpl  Redefined hbase.regionserver.impl.
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReducerJob(String table,
      Class<? extends TableReducer> reducer, Job job,
      Class partitioner, String quorumAddress, String serverClass,
      String serverImpl) throws IOException {
    initTableReducerJob(table, reducer, job, partitioner, quorumAddress,
        serverClass, serverImpl, true);
  }

  /**
   * Use this before submitting a TableReduce job. It will appropriately set
   * up the job.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.  Make sure the passed job is
   *   carrying all necessary HBase configuration.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   *   default partitioner.
   * @param quorumAddress  Distant cluster to write to; default is null for
   *   output to the cluster that is designated in <code>hbase-site.xml</code>.
   *   Set this String to the zookeeper ensemble of an alternate remote
   *   cluster when you would have the reduce write to a cluster other than
   *   the default; e.g. when copying tables between clusters, the source
   *   would be designated by <code>hbase-site.xml</code> and this param would
   *   have the ensemble address of the remote cluster. The format to pass is
   *   <code>&lt;hbase.zookeeper.quorum&gt;:&lt;hbase.zookeeper.client.port&gt;:&lt;zookeeper.znode.parent&gt;</code>,
   *   such as <code>server,server2,server3:2181:/hbase</code>.
   * @param serverClass  Redefined hbase.regionserver.class.
   * @param serverImpl  Redefined hbase.regionserver.impl.
   * @param addDependencyJars  Upload HBase jars and jars for any of the
   *   configured job classes via the distributed cache (tmpjars).
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReducerJob(String table,
      Class<? extends TableReducer> reducer, Job job,
      Class partitioner, String quorumAddress, String serverClass,
      String serverImpl, boolean addDependencyJars) throws IOException {

    Configuration conf = job.getConfiguration();
    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
    job.setOutputFormatClass(TableOutputFormat.class);
    if (reducer != null) job.setReducerClass(reducer);
    conf.set(TableOutputFormat.OUTPUT_TABLE, table);

    if (quorumAddress != null) {
      // Calling this will validate the format of quorumAddress.
      ZKUtil.transformClusterKey(quorumAddress);
      conf.set(TableOutputFormat.QUORUM_ADDRESS, quorumAddress);
    }
    if (serverClass != null && serverImpl != null) {
      conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
      conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
    }
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    if (partitioner == HRegionPartitioner.class) {
      job.setPartitionerClass(HRegionPartitioner.class);
      HTable outputTable = new HTable(conf, table);
      try {
        // Cap the reduce task count at the table's region count.
        int regions = outputTable.getRegionsInfo().size();
        if (job.getNumReduceTasks() > regions) {
          job.setNumReduceTasks(regions);
        }
      } finally {
        outputTable.close();
      }
    } else if (partitioner != null) {
      job.setPartitionerClass(partitioner);
    }

    if (addDependencyJars) {
      addDependencyJars(job);
    }

    initCredentials(job);
  }
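
  // Usage sketch (hypothetical driver code): a full map/reduce round trip
  // that writes summaries back to another table. MyMapper and MyReducer are
  // assumptions; MyReducer would extend
  // TableReducer<Text, IntWritable, ImmutableBytesWritable>.
  //
  //   Job job = new Job(HBaseConfiguration.create(), "summarize");
  //   job.setJarByClass(MyMapper.class);
  //   TableMapReduceUtil.initTableMapperJob("source-table", new Scan(),
  //       MyMapper.class, Text.class, IntWritable.class, job);
  //   TableMapReduceUtil.initTableReducerJob("target-table", MyReducer.class,
  //       job, HRegionPartitioner.class);
  //   job.waitForCompletion(true);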

  /**
   * Ensures that the given number of reduce tasks for the given job
   * configuration does not exceed the number of regions for the given table.
   *
   * @param table  The table to get the region count for.
   * @param job  The current job to adjust.
   * @throws IOException When retrieving the table details fails.
   */
  public static void limitNumReduceTasks(String table, Job job)
      throws IOException {
    HTable outputTable = new HTable(job.getConfiguration(), table);
    try {
      int regions = outputTable.getRegionsInfo().size();
      if (job.getNumReduceTasks() > regions) {
        job.setNumReduceTasks(regions);
      }
    } finally {
      outputTable.close();
    }
  }

  /**
   * Sets the number of reduce tasks for the given job configuration to the
   * number of regions the given table has.
   *
   * @param table  The table to get the region count for.
   * @param job  The current job to adjust.
   * @throws IOException When retrieving the table details fails.
   */
  public static void setNumReduceTasks(String table, Job job)
      throws IOException {
    HTable outputTable = new HTable(job.getConfiguration(), table);
    try {
      job.setNumReduceTasks(outputTable.getRegionsInfo().size());
    } finally {
      outputTable.close();
    }
  }

  /**
   * Sets the number of rows to return and cache with each scanner iteration.
   * Higher caching values will enable faster mapreduce jobs at the expense of
   * requiring more heap to contain the cached rows.
   *
   * @param job  The current job to adjust.
   * @param batchSize  The number of rows to return in batch with each scanner
   *   iteration.
   */
  public static void setScannerCaching(Job job, int batchSize) {
    job.getConfiguration().setInt("hbase.client.scanner.caching", batchSize);
  }
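
  // Example (hypothetical): for scan-heavy jobs a caching value in the
  // hundreds usually amortizes RPC overhead well; the heap cost is roughly
  // batchSize times the average row size, per open scanner.
  //
  //   TableMapReduceUtil.setScannerCaching(job, 500);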

  /**
   * Adds the HBase dependency jars, as well as jars for any of the configured
   * job classes, to the job configuration so that JobClient will ship them
   * to the cluster and add them to the DistributedCache.
   *
   * @param job  The job to configure.
   * @throws IOException When a configured class cannot be resolved to a jar.
   */
  public static void addDependencyJars(Job job) throws IOException {
    try {
      addDependencyJars(job.getConfiguration(),
          org.apache.zookeeper.ZooKeeper.class,
          com.google.protobuf.Message.class,
          com.google.common.collect.ImmutableSet.class,
          org.apache.hadoop.hbase.util.Bytes.class,
          job.getMapOutputKeyClass(),
          job.getMapOutputValueClass(),
          job.getInputFormatClass(),
          job.getOutputKeyClass(),
          job.getOutputValueClass(),
          job.getOutputFormatClass(),
          job.getPartitionerClass(),
          job.getCombinerClass());
    } catch (ClassNotFoundException e) {
      throw new IOException(e);
    }
  }
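
  // Example (hypothetical): a job that needs a third-party library on the
  // task classpath can ship it the same way; com.example.SomeHelper is an
  // assumed user class living in that library's jar.
  //
  //   TableMapReduceUtil.addDependencyJars(job);  // HBase + configured job classes
  //   TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
  //       com.example.SomeHelper.class);          // the extra jar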

  /**
   * Adds the jars containing the given classes to the job's configuration
   * such that JobClient will ship them to the cluster and add them to
   * the DistributedCache.
   *
   * @param conf  The configuration to modify.
   * @param classes  The classes whose containing jars should be shipped.
   * @throws IOException When a jar cannot be found or validated.
   */
  public static void addDependencyJars(Configuration conf,
      Class<?>... classes) throws IOException {

    FileSystem localFs = FileSystem.getLocal(conf);
    Set<String> jars = new HashSet<String>();
    // Add jars that are already in the tmpjars variable.
    jars.addAll(conf.getStringCollection("tmpjars"));

    // Map from jar contents to the jar's path, so we can avoid creating new
    // jars for classes that have already been packaged.
    Map<String, String> packagedClasses = new HashMap<String, String>();

    // Add jars containing the specified classes.
    for (Class<?> clazz : classes) {
      if (clazz == null) continue;

      Path path = findOrCreateJar(clazz, localFs, packagedClasses);
      if (path == null) {
        LOG.warn("Could not find jar for class " + clazz +
            " in order to ship it to the cluster.");
        continue;
      }
      if (!localFs.exists(path)) {
        LOG.warn("Could not validate jar file " + path + " for class "
            + clazz);
        continue;
      }
      jars.add(path.toString());
    }
    if (jars.isEmpty()) return;

    conf.set("tmpjars",
        StringUtils.arrayToString(jars.toArray(new String[0])));
  }

  /**
   * Finds the jar for a class, or creates one if it doesn't exist. If the
   * class is in a directory on the classpath, it creates a jar on the fly
   * with the contents of the directory and returns the path to that jar. If
   * a jar is created, it is created in the system temporary directory.
   * Otherwise, returns an existing jar that contains a class of the same
   * name. Maintains a mapping from jar contents to the tmp jar created.
   *
   * @param my_class  The class to find.
   * @param fs  The FileSystem with which to qualify the returned path.
   * @param packagedClasses  A map of class name to containing jar.
   * @return A jar file that contains the class.
   * @throws IOException When no resource for the class can be located.
   */
  private static Path findOrCreateJar(Class<?> my_class, FileSystem fs,
      Map<String, String> packagedClasses)
      throws IOException {
    // Attempt to locate an existing jar for the class.
    String jar = findContainingJar(my_class, packagedClasses);
    if (null == jar || jar.isEmpty()) {
      jar = getJar(my_class);
      updateMap(jar, packagedClasses);
    }

    if (null == jar || jar.isEmpty()) {
      throw new IOException("Cannot locate resource for class " + my_class.getName());
    }

    LOG.debug(String.format("For class %s, using jar %s", my_class.getName(), jar));
    return new Path(jar).makeQualified(fs);
  }

  /**
   * Adds entries to <code>packagedClasses</code> corresponding to class files
   * contained in <code>jar</code>.
   *
   * @param jar  The jar whose contents to list.
   * @param packagedClasses  A map of class name to containing jar.
   * @throws IOException When the jar cannot be read.
   */
  private static void updateMap(String jar, Map<String, String> packagedClasses)
      throws IOException {
    // Guard against a null or empty jar; findOrCreateJar reports the failure.
    if (null == jar || jar.isEmpty()) return;
    ZipFile zip = null;
    try {
      zip = new ZipFile(jar);
      for (Enumeration<? extends ZipEntry> iter = zip.entries(); iter.hasMoreElements();) {
        ZipEntry entry = iter.nextElement();
        if (entry.getName().endsWith(".class")) {
          packagedClasses.put(entry.getName(), jar);
        }
      }
    } finally {
      if (null != zip) zip.close();
    }
  }

  /**
   * Finds a jar that contains a class of the same name, if any. It will
   * return a jar file, even if that is not the first thing on the class path
   * that has a class with the same name. Looks first on the classpath and
   * then in the <code>packagedClasses</code> map.
   *
   * @param my_class  The class to find.
   * @param packagedClasses  A map of class name to containing jar.
   * @return A jar file that contains the class, or null.
   * @throws IOException When the classpath resources cannot be enumerated.
   */
  private static String findContainingJar(Class<?> my_class, Map<String, String> packagedClasses)
      throws IOException {
    ClassLoader loader = my_class.getClassLoader();
    String class_file = my_class.getName().replaceAll("\\.", "/") + ".class";

    // First search the classpath.
    for (Enumeration<URL> itr = loader.getResources(class_file); itr.hasMoreElements();) {
      URL url = itr.nextElement();
      if ("jar".equals(url.getProtocol())) {
        String toReturn = url.getPath();
        if (toReturn.startsWith("file:")) {
          toReturn = toReturn.substring("file:".length());
        }
        // URLDecoder is a misnamed class, since it actually decodes
        // x-www-form-urlencoded MIME type rather than actual
        // URL encoding (which the file path has). Therefore it would
        // decode +s to ' 's which is incorrect (spaces are actually
        // either unencoded or encoded as "%20"). Replace +s first, so
        // that they are kept sacred during the decoding process.
        toReturn = toReturn.replaceAll("\\+", "%2B");
        toReturn = URLDecoder.decode(toReturn, "UTF-8");
        return toReturn.replaceAll("!.*$", "");
      }
    }

    // Now look in any jars we've packaged using JarFinder; returns null
    // when the class is not found there either.
    return packagedClasses.get(class_file);
  }

  /**
   * Invokes 'getJar' on a JarFinder implementation. Prefers Hadoop's own
   * org.apache.hadoop.util.JarFinder when it is available on the classpath
   * (Hadoop 0.23+); otherwise falls back to the backported copy shipped
   * with HBase.
   *
   * @param my_class  The class to find.
   * @return A jar file that contains the class, or null.
   */
  private static String getJar(Class<?> my_class) {
    String ret = null;
    String hadoopJarFinder = "org.apache.hadoop.util.JarFinder";
    Class<?> jarFinder = null;
    try {
      LOG.debug("Looking for " + hadoopJarFinder + ".");
      jarFinder = Class.forName(hadoopJarFinder);
      LOG.debug(hadoopJarFinder + " found.");
      Method getJar = jarFinder.getMethod("getJar", Class.class);
      ret = (String) getJar.invoke(null, my_class);
    } catch (ClassNotFoundException e) {
      LOG.debug("Using backported JarFinder.");
      ret = JarFinder.getJar(my_class);
    } catch (InvocationTargetException e) {
      // The method was invoked, but it threw an exception of its own;
      // unwrap and rethrow the underlying cause.
      throw new RuntimeException(e.getCause());
    } catch (Exception e) {
      // Any other reflection failure means JarFinder is present but could
      // not be called; there is no sensible recovery, so fail fast.
      throw new RuntimeException("getJar invocation failed.", e);
    }

    return ret;
  }
}