/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.catalog.MetaReader;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.hadoopbackport.JarFinder;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.security.UserProvider;
import org.apache.hadoop.hbase.security.token.AuthenticationTokenIdentifier;
import org.apache.hadoop.hbase.security.token.AuthenticationTokenSelector;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.zookeeper.ZKClusterId;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.StringUtils;
import org.apache.zookeeper.KeeperException;
import org.cliffc.high_scale_lib.Counter;

import com.google.protobuf.InvalidProtocolBufferException;

/**
 * Utility for {@link TableMapper} and {@link TableReducer}.
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@InterfaceAudience.Public
@InterfaceStability.Stable
public class TableMapReduceUtil {
  static Log LOG = LogFactory.getLog(TableMapReduceUtil.class);

  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  The table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(String table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<?> outputKeyClass,
      Class<?> outputValueClass, Job job)
  throws IOException {
    initTableMapperJob(table, scan, mapper, outputKeyClass, outputValueClass,
        job, true);
  }
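
  /*
   * Example usage (illustrative sketch, not part of the original source): a minimal driver
   * wiring a read-only scan job. The table name "exampleTable", the column family "cf" and
   * the MyMapper class are hypothetical placeholders.
   *
   *   Configuration conf = HBaseConfiguration.create();
   *   Job job = new Job(conf, "example-table-scan");
   *   job.setJarByClass(MyMapper.class);
   *   Scan scan = new Scan();
   *   scan.addFamily(Bytes.toBytes("cf"));
   *   scan.setCaching(500);        // larger caching is typical for MR scans
   *   scan.setCacheBlocks(false);  // avoid polluting the region server block cache
   *   TableMapReduceUtil.initTableMapperJob("exampleTable", scan, MyMapper.class,
   *       ImmutableBytesWritable.class, Result.class, job);
   *   job.setNumReduceTasks(0);    // map-only job
   *   System.exit(job.waitForCompletion(true) ? 0 : 1);
   */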

  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table Binary representation of the table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(byte[] table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<?> outputKeyClass,
      Class<?> outputValueClass, Job job)
  throws IOException {
    initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass, outputValueClass,
        job, true);
  }

  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  The table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the configured
   *           job classes via the distributed cache (tmpjars).
   * @param inputFormatClass  The class of the input format to use.
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(String table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<?> outputKeyClass,
      Class<?> outputValueClass, Job job,
      boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass)
  throws IOException {
    initTableMapperJob(table, scan, mapper, outputKeyClass, outputValueClass, job,
        addDependencyJars, true, inputFormatClass);
  }

  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  The table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the configured
   *           job classes via the distributed cache (tmpjars).
   * @param initCredentials whether to initialize hbase auth credentials for the job.
   * @param inputFormatClass  The class of the input format to use.
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(String table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<?> outputKeyClass,
      Class<?> outputValueClass, Job job,
      boolean addDependencyJars, boolean initCredentials,
      Class<? extends InputFormat> inputFormatClass)
  throws IOException {
    job.setInputFormatClass(inputFormatClass);
    if (outputValueClass != null) job.setMapOutputValueClass(outputValueClass);
    if (outputKeyClass != null) job.setMapOutputKeyClass(outputKeyClass);
    job.setMapperClass(mapper);
    if (Put.class.equals(outputValueClass)) {
      job.setCombinerClass(PutCombiner.class);
    }
    Configuration conf = job.getConfiguration();
    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
    conf.set(TableInputFormat.INPUT_TABLE, table);
    conf.set(TableInputFormat.SCAN, convertScanToString(scan));
    conf.setStrings("io.serializations", conf.get("io.serializations"),
        MutationSerialization.class.getName(), ResultSerialization.class.getName(),
        KeyValueSerialization.class.getName());
    if (addDependencyJars) {
      addDependencyJars(job);
    }
    if (initCredentials) {
      initCredentials(job);
    }
  }

  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table Binary representation of the table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the configured
   *           job classes via the distributed cache (tmpjars).
   * @param inputFormatClass  The class of the input format to use.
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(byte[] table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<?> outputKeyClass,
      Class<?> outputValueClass, Job job,
      boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass)
  throws IOException {
    initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass,
        outputValueClass, job, addDependencyJars, inputFormatClass);
  }

  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table Binary representation of the table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the configured
   *           job classes via the distributed cache (tmpjars).
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(byte[] table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<?> outputKeyClass,
      Class<?> outputValueClass, Job job,
      boolean addDependencyJars)
  throws IOException {
    initTableMapperJob(Bytes.toString(table), scan, mapper, outputKeyClass,
        outputValueClass, job, addDependencyJars, TableInputFormat.class);
  }

  /**
   * Use this before submitting a TableMap job. It will appropriately set up
   * the job.
   *
   * @param table  The table name to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the configured
   *           job classes via the distributed cache (tmpjars).
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(String table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<?> outputKeyClass,
      Class<?> outputValueClass, Job job,
      boolean addDependencyJars)
  throws IOException {
    initTableMapperJob(table, scan, mapper, outputKeyClass,
        outputValueClass, job, addDependencyJars, TableInputFormat.class);
  }

  /**
   * Resets the HFile block cache size to its default and disables the off-heap and
   * bucket caches in the given configuration. Used by snapshot-based jobs, where the
   * tasks read store files directly and region server cache settings are not
   * meaningful on the client side.
   *
   * @param conf The configuration to update.
   */
  public static void resetCacheConfig(Configuration conf) {
    conf.setFloat(
        HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, HConstants.HFILE_BLOCK_CACHE_SIZE_DEFAULT);
    conf.setFloat("hbase.offheapcache.percentage", 0f);
    conf.setFloat("hbase.bucketcache.size", 0f);
  }

  /**
   * Sets up the job for reading from a table snapshot. It bypasses hbase servers
   * and reads directly from the snapshot files.
   *
   * @param snapshotName The name of the snapshot (of a table) to read from.
   * @param scan  The scan instance with the columns, time range etc.
   * @param mapper  The mapper class to use.
   * @param outputKeyClass  The class of the output key.
   * @param outputValueClass  The class of the output value.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the configured
   *           job classes via the distributed cache (tmpjars).
   * @param tmpRestoreDir a temporary directory to copy the snapshot files into. The current
   * user should have write permissions to this directory, and it should not be a
   * subdirectory of the HBase root directory. The directory can be deleted once the job
   * is finished.
   * @throws IOException When setting up the details fails.
   * @see TableSnapshotInputFormat
   */
  public static void initTableSnapshotMapperJob(String snapshotName, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<?> outputKeyClass,
      Class<?> outputValueClass, Job job,
      boolean addDependencyJars, Path tmpRestoreDir)
  throws IOException {
    TableSnapshotInputFormat.setInput(job, snapshotName, tmpRestoreDir);
    initTableMapperJob(snapshotName, scan, mapper, outputKeyClass,
        outputValueClass, job, addDependencyJars, false, TableSnapshotInputFormat.class);

    // block caching is not useful when reading directly from the snapshot files
    resetCacheConfig(job.getConfiguration());

    // also ship the high-scale-lib jar (Counter), needed by the server-side scan code
    // that the tasks run locally
    TableMapReduceUtil.addDependencyJars(job.getConfiguration(), Counter.class);
  }
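
  /*
   * Example usage (illustrative sketch): reading from an existing snapshot instead of the
   * live table. "exampleSnapshot", MyMapper and the restore directory are hypothetical; the
   * restore directory must be writable by the current user and should not live under the
   * HBase root directory.
   *
   *   Job job = new Job(HBaseConfiguration.create(), "example-snapshot-scan");
   *   job.setJarByClass(MyMapper.class);
   *   Scan scan = new Scan();
   *   TableMapReduceUtil.initTableSnapshotMapperJob("exampleSnapshot", scan, MyMapper.class,
   *       ImmutableBytesWritable.class, Result.class, job, true,
   *       new Path("/tmp/snapshot-restore"));
   */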

  /**
   * Use this before submitting a Multi TableMap job. It will appropriately set
   * up the job.
   *
   * @param scans The list of {@link Scan} objects to read from.
   * @param mapper The mapper class to use.
   * @param outputKeyClass The class of the output key.
   * @param outputValueClass The class of the output value.
   * @param job The current job to adjust. Make sure the passed job is carrying
   *          all necessary HBase configuration.
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(List<Scan> scans,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job) throws IOException {
    initTableMapperJob(scans, mapper, outputKeyClass, outputValueClass, job,
        true);
  }

  /**
   * Use this before submitting a Multi TableMap job. It will appropriately set
   * up the job.
   *
   * @param scans The list of {@link Scan} objects to read from.
   * @param mapper The mapper class to use.
   * @param outputKeyClass The class of the output key.
   * @param outputValueClass The class of the output value.
   * @param job The current job to adjust. Make sure the passed job is carrying
   *          all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the
   *          configured job classes via the distributed cache (tmpjars).
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(List<Scan> scans,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job,
      boolean addDependencyJars) throws IOException {
    initTableMapperJob(scans, mapper, outputKeyClass, outputValueClass, job,
        addDependencyJars, true);
  }

  /**
   * Use this before submitting a Multi TableMap job. It will appropriately set
   * up the job.
   *
   * @param scans The list of {@link Scan} objects to read from.
   * @param mapper The mapper class to use.
   * @param outputKeyClass The class of the output key.
   * @param outputValueClass The class of the output value.
   * @param job The current job to adjust. Make sure the passed job is carrying
   *          all necessary HBase configuration.
   * @param addDependencyJars upload HBase jars and jars for any of the
   *          configured job classes via the distributed cache (tmpjars).
   * @param initCredentials whether to initialize hbase auth credentials for the job.
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(List<Scan> scans,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job,
      boolean addDependencyJars,
      boolean initCredentials) throws IOException {
    job.setInputFormatClass(MultiTableInputFormat.class);
    if (outputValueClass != null) {
      job.setMapOutputValueClass(outputValueClass);
    }
    if (outputKeyClass != null) {
      job.setMapOutputKeyClass(outputKeyClass);
    }
    job.setMapperClass(mapper);
    HBaseConfiguration.addHbaseResources(job.getConfiguration());
    List<String> scanStrings = new ArrayList<String>();

    for (Scan scan : scans) {
      scanStrings.add(convertScanToString(scan));
    }
    job.getConfiguration().setStrings(MultiTableInputFormat.SCANS,
        scanStrings.toArray(new String[scanStrings.size()]));

    if (addDependencyJars) {
      addDependencyJars(job);
    }

    if (initCredentials) {
      initCredentials(job);
    }
  }
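
  /*
   * Example usage (illustrative sketch): a single job scanning several tables. With
   * MultiTableInputFormat each Scan typically carries its table name in the
   * Scan.SCAN_ATTRIBUTES_TABLE_NAME attribute; the table names below are hypothetical.
   *
   *   List<Scan> scans = new ArrayList<Scan>();
   *   for (String tableName : new String[] { "table1", "table2" }) {
   *     Scan scan = new Scan();
   *     scan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes(tableName));
   *     scans.add(scan);
   *   }
   *   TableMapReduceUtil.initTableMapperJob(scans, MyMapper.class,
   *       ImmutableBytesWritable.class, Result.class, job);
   */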

  /**
   * Obtain Hadoop and HBase security credentials, if security is enabled, and add
   * them to the given job's credentials.
   *
   * @param job The job that requires the permission.
   * @throws IOException When the authentication token cannot be obtained.
   */
  public static void initCredentials(Job job) throws IOException {
    UserProvider userProvider = UserProvider.instantiate(job.getConfiguration());
    if (userProvider.isHadoopSecurityEnabled()) {
      // propagate delegation related props from the launcher job to the MR job
      if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
            System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
      }
    }

    if (userProvider.isHBaseSecurityEnabled()) {
      try {
        // init credentials for the remote cluster, if one is configured for output
        String quorumAddress = job.getConfiguration().get(TableOutputFormat.QUORUM_ADDRESS);
        User user = userProvider.getCurrent();
        if (quorumAddress != null) {
          Configuration peerConf = HBaseConfiguration.create(job.getConfiguration());
          ZKUtil.applyClusterKeyToConf(peerConf, quorumAddress);
          obtainAuthTokenForJob(job, peerConf, user);
        }

        obtainAuthTokenForJob(job, job.getConfiguration(), user);
      } catch (InterruptedException ie) {
        LOG.info("Interrupted obtaining user authentication token");
        Thread.currentThread().interrupt();
      }
    }
  }

  /**
   * Obtain an authentication token, for the specified cluster, on behalf of the current user
   * and add it to the credentials for the given map reduce job.
   *
   * The quorumAddress is the key to the ZK ensemble, which contains:
   * hbase.zookeeper.quorum, hbase.zookeeper.client.port and zookeeper.znode.parent
   *
   * @param job The job that requires the permission.
   * @param quorumAddress string that contains the 3 required configurations
   * @throws IOException When the authentication token cannot be obtained.
   */
  public static void initCredentialsForCluster(Job job, String quorumAddress)
      throws IOException {
    UserProvider userProvider = UserProvider.instantiate(job.getConfiguration());
    if (userProvider.isHBaseSecurityEnabled()) {
      try {
        Configuration peerConf = HBaseConfiguration.create(job.getConfiguration());
        ZKUtil.applyClusterKeyToConf(peerConf, quorumAddress);
        obtainAuthTokenForJob(job, peerConf, userProvider.getCurrent());
      } catch (InterruptedException e) {
        LOG.info("Interrupted obtaining user authentication token");
        // restore the interrupt status rather than clearing it
        Thread.currentThread().interrupt();
      }
    }
  }

  private static void obtainAuthTokenForJob(Job job, Configuration conf, User user)
      throws IOException, InterruptedException {
    Token<AuthenticationTokenIdentifier> authToken = getAuthToken(conf, user);
    if (authToken == null) {
      user.obtainAuthTokenForJob(conf, job);
    } else {
      job.getCredentials().addToken(authToken.getService(), authToken);
    }
  }

  /**
   * Get the authentication token of the user for the cluster specified in the configuration.
   * @return null if the user does not have the token, otherwise the auth token for the cluster.
   */
  private static Token<AuthenticationTokenIdentifier> getAuthToken(Configuration conf, User user)
      throws IOException, InterruptedException {
    ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "mr-init-credentials", null);
    try {
      String clusterId = ZKClusterId.readClusterIdZNode(zkw);
      return new AuthenticationTokenSelector().selectToken(new Text(clusterId),
          user.getUGI().getTokens());
    } catch (KeeperException e) {
      throw new IOException(e);
    } finally {
      zkw.close();
    }
  }

  /**
   * Writes the given scan into a Base64 encoded string.
   *
   * @param scan  The scan to write out.
   * @return The scan saved in a Base64 encoded string.
   * @throws IOException When writing the scan fails.
   */
  static String convertScanToString(Scan scan) throws IOException {
    ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
    return Base64.encodeBytes(proto.toByteArray());
  }

  /**
   * Converts the given Base64 string back into a Scan instance.
   *
   * @param base64  The scan details.
   * @return The newly created Scan instance.
   * @throws IOException When reading the scan instance fails.
   */
  static Scan convertStringToScan(String base64) throws IOException {
    byte [] decoded = Base64.decode(base64);
    ClientProtos.Scan scan;
    try {
      scan = ClientProtos.Scan.parseFrom(decoded);
    } catch (InvalidProtocolBufferException ipbe) {
      throw new IOException(ipbe);
    }

    return ProtobufUtil.toScan(scan);
  }

  /**
   * Use this before submitting a TableReduce job. It will
   * appropriately set up the JobConf.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReducerJob(String table,
      Class<? extends TableReducer> reducer, Job job)
  throws IOException {
    initTableReducerJob(table, reducer, job, null);
  }

  /**
   * Use this before submitting a TableReduce job. It will
   * appropriately set up the JobConf.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   * default partitioner.
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReducerJob(String table,
      Class<? extends TableReducer> reducer, Job job,
      Class partitioner) throws IOException {
    initTableReducerJob(table, reducer, job, partitioner, null, null, null);
  }

  /**
   * Use this before submitting a TableReduce job. It will
   * appropriately set up the JobConf.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   * default partitioner.
   * @param quorumAddress Distant cluster to write to; default is null for
   * output to the cluster that is designated in <code>hbase-site.xml</code>.
   * Set this String to the zookeeper ensemble of an alternate remote cluster
   * when you would have the reduce write a cluster that is other than the
   * default; e.g. copying tables between clusters, the source would be
   * designated by <code>hbase-site.xml</code> and this param would have the
   * ensemble address of the remote cluster. The format to pass is particular.
   * Pass <code>&lt;hbase.zookeeper.quorum&gt;:&lt;hbase.zookeeper.client.port&gt;:&lt;zookeeper.znode.parent&gt;</code>
   * such as <code>server,server2,server3:2181:/hbase</code>.
   * @param serverClass redefined hbase.regionserver.class
   * @param serverImpl redefined hbase.regionserver.impl
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReducerJob(String table,
      Class<? extends TableReducer> reducer, Job job,
      Class partitioner, String quorumAddress, String serverClass,
      String serverImpl) throws IOException {
    initTableReducerJob(table, reducer, job, partitioner, quorumAddress,
        serverClass, serverImpl, true);
  }

  /**
   * Use this before submitting a TableReduce job. It will
   * appropriately set up the JobConf.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   * default partitioner.
   * @param quorumAddress Distant cluster to write to; default is null for
   * output to the cluster that is designated in <code>hbase-site.xml</code>.
   * Set this String to the zookeeper ensemble of an alternate remote cluster
   * when you would have the reduce write a cluster that is other than the
   * default; e.g. copying tables between clusters, the source would be
   * designated by <code>hbase-site.xml</code> and this param would have the
   * ensemble address of the remote cluster. The format to pass is particular.
   * Pass <code>&lt;hbase.zookeeper.quorum&gt;:&lt;hbase.zookeeper.client.port&gt;:&lt;zookeeper.znode.parent&gt;</code>
   * such as <code>server,server2,server3:2181:/hbase</code>.
   * @param serverClass redefined hbase.regionserver.class
   * @param serverImpl redefined hbase.regionserver.impl
   * @param addDependencyJars upload HBase jars and jars for any of the configured
   *           job classes via the distributed cache (tmpjars).
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReducerJob(String table,
      Class<? extends TableReducer> reducer, Job job,
      Class partitioner, String quorumAddress, String serverClass,
      String serverImpl, boolean addDependencyJars) throws IOException {

    Configuration conf = job.getConfiguration();
    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
    job.setOutputFormatClass(TableOutputFormat.class);
    if (reducer != null) job.setReducerClass(reducer);
    conf.set(TableOutputFormat.OUTPUT_TABLE, table);
    conf.setStrings("io.serializations", conf.get("io.serializations"),
        MutationSerialization.class.getName(), ResultSerialization.class.getName());
    // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
    if (quorumAddress != null) {
      // Calling this will validate the format
      ZKUtil.transformClusterKey(quorumAddress);
      conf.set(TableOutputFormat.QUORUM_ADDRESS, quorumAddress);
    }
    if (serverClass != null && serverImpl != null) {
      conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
      conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
    }
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    if (partitioner == HRegionPartitioner.class) {
      job.setPartitionerClass(HRegionPartitioner.class);
      int regions = MetaReader.getRegionCount(conf, table);
      if (job.getNumReduceTasks() > regions) {
        job.setNumReduceTasks(regions);
      }
    } else if (partitioner != null) {
      job.setPartitionerClass(partitioner);
    }

    if (addDependencyJars) {
      addDependencyJars(job);
    }

    initCredentials(job);
  }
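
  /*
   * Example usage (illustrative sketch): a job whose reduce phase writes into an HBase
   * table. "exampleTable", MyMapper and MyTableReducer are hypothetical; the reducer emits
   * Put (or Delete) mutations keyed by ImmutableBytesWritable.
   *
   *   Job job = new Job(HBaseConfiguration.create(), "example-table-write");
   *   job.setJarByClass(MyTableReducer.class);
   *   job.setMapperClass(MyMapper.class);
   *   TableMapReduceUtil.initTableReducerJob("exampleTable", MyTableReducer.class, job);
   */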

  /**
   * Ensures that the given number of reduce tasks for the given job
   * configuration does not exceed the number of regions for the given table.
   *
   * @param table  The table to get the region count for.
   * @param job  The current job to adjust.
   * @throws IOException When retrieving the table details fails.
   */
  public static void limitNumReduceTasks(String table, Job job)
  throws IOException {
    int regions = MetaReader.getRegionCount(job.getConfiguration(), table);
    if (job.getNumReduceTasks() > regions)
      job.setNumReduceTasks(regions);
  }

  /**
   * Sets the number of reduce tasks for the given job configuration to the
   * number of regions the given table has.
   *
   * @param table  The table to get the region count for.
   * @param job  The current job to adjust.
   * @throws IOException When retrieving the table details fails.
   */
  public static void setNumReduceTasks(String table, Job job)
  throws IOException {
    job.setNumReduceTasks(MetaReader.getRegionCount(job.getConfiguration(), table));
  }

  /**
   * Sets the number of rows to return and cache with each scanner iteration.
   * Higher caching values will enable faster mapreduce jobs at the expense of
   * requiring more heap to contain the cached rows.
   *
   * @param job The current job to adjust.
   * @param batchSize The number of rows to return in batch with each scanner
   * iteration.
   */
  public static void setScannerCaching(Job job, int batchSize) {
    job.getConfiguration().setInt("hbase.client.scanner.caching", batchSize);
  }

  /**
   * Add HBase and its dependencies (only) to the job configuration.
   * <p>
   * This is intended as a low-level API, facilitating code reuse between this
   * class and its mapred counterpart. It is also of use to external tools that
   * need to build a MapReduce job that interacts with HBase but want
   * fine-grained control over the jars shipped to the cluster.
   * </p>
   * @param conf The Configuration object to extend with dependencies.
   * @see org.apache.hadoop.hbase.mapred.TableMapReduceUtil
   */
  public static void addHBaseDependencyJars(Configuration conf) throws IOException {
    addDependencyJars(conf,
        // explicitly pull a class from each module
        org.apache.hadoop.hbase.HConstants.class,                      // hbase-common
        org.apache.hadoop.hbase.protobuf.generated.ClientProtos.class, // hbase-protocol
        org.apache.hadoop.hbase.client.Put.class,                      // hbase-client
        org.apache.hadoop.hbase.CompatibilityFactory.class,            // hbase-hadoop-compat
        org.apache.hadoop.hbase.mapreduce.TableMapper.class,           // hbase-server
        // pull necessary dependencies
        org.apache.zookeeper.ZooKeeper.class,
        org.jboss.netty.channel.ChannelFactory.class,
        com.google.protobuf.Message.class,
        com.google.common.collect.Lists.class,
        org.cloudera.htrace.Trace.class);
  }

  /**
   * Returns a classpath string built from the content of the "tmpjars" value in
   * the given configuration.
   */
  public static String buildDependencyClasspath(Configuration conf) {
    if (conf == null) {
      throw new IllegalArgumentException("Must provide a configuration object.");
    }
    Set<String> paths = new HashSet<String>(conf.getStringCollection("tmpjars"));
    if (paths.size() == 0) {
      throw new IllegalArgumentException("Configuration contains no tmpjars.");
    }
    StringBuilder sb = new StringBuilder();
    for (String s : paths) {
      // entries can take the form 'file:/path/to/file.jar'; strip the scheme prefix
      int idx = s.indexOf(":");
      if (idx != -1) s = s.substring(idx + 1);
      if (sb.length() > 0) sb.append(File.pathSeparator);
      sb.append(s);
    }
    return sb.toString();
  }
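
  /*
   * Example usage (illustrative sketch): turning the shipped dependency jars into a local
   * classpath string after they have been added to the "tmpjars" configuration value.
   *
   *   Job job = new Job(HBaseConfiguration.create(), "example");
   *   TableMapReduceUtil.addDependencyJars(job);
   *   String cp = TableMapReduceUtil.buildDependencyClasspath(job.getConfiguration());
   *   // cp now contains the jar paths from "tmpjars", joined with File.pathSeparator
   */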

  /**
   * Add the HBase dependency jars as well as jars for any of the configured
   * job classes to the job configuration, so that JobClient will ship them
   * to the cluster and add them to the DistributedCache.
   */
  public static void addDependencyJars(Job job) throws IOException {
    addHBaseDependencyJars(job.getConfiguration());
    try {
      addDependencyJars(job.getConfiguration(),
          // when making changes here, consider also mapred.TableMapReduceUtil
          // pull job classes
          job.getMapOutputKeyClass(),
          job.getMapOutputValueClass(),
          job.getInputFormatClass(),
          job.getOutputKeyClass(),
          job.getOutputValueClass(),
          job.getOutputFormatClass(),
          job.getPartitionerClass(),
          job.getCombinerClass());
    } catch (ClassNotFoundException e) {
      throw new IOException(e);
    }
  }

  /**
   * Add the jars containing the given classes to the job's configuration
   * such that JobClient will ship them to the cluster and add them to
   * the DistributedCache.
   */
  public static void addDependencyJars(Configuration conf,
      Class<?>... classes) throws IOException {

    FileSystem localFs = FileSystem.getLocal(conf);
    Set<String> jars = new HashSet<String>();
    // Add jars that are already in the tmpjars variable
    jars.addAll(conf.getStringCollection("tmpjars"));

    // add jars as we find them to a map of contents jar name so that we can avoid
    // creating new jars for classes that have already been packaged.
    Map<String, String> packagedClasses = new HashMap<String, String>();

    // Add jars containing the specified classes
    for (Class<?> clazz : classes) {
      if (clazz == null) continue;

      Path path = findOrCreateJar(clazz, localFs, packagedClasses);
      if (path == null) {
        LOG.warn("Could not find jar for class " + clazz +
            " in order to ship it to the cluster.");
        continue;
      }
      if (!localFs.exists(path)) {
        LOG.warn("Could not validate jar file " + path + " for class "
            + clazz);
        continue;
      }
      jars.add(path.toString());
    }
    if (jars.isEmpty()) return;

    conf.set("tmpjars", StringUtils.arrayToString(jars.toArray(new String[jars.size()])));
  }
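
  /*
   * Example usage (illustrative sketch): shipping an additional user class with the job.
   * MyCustomFilter is a hypothetical class; the jar containing it is located (or created on
   * the fly from a classpath directory) and appended to the "tmpjars" configuration value.
   *
   *   TableMapReduceUtil.addDependencyJars(job.getConfiguration(), MyCustomFilter.class);
   */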

  /**
   * Finds the Jar for a class or creates it if it doesn't exist. If the class is in
   * a directory in the classpath, it creates a Jar on the fly with the
   * contents of the directory and returns the path to that Jar. If a Jar is
   * created, it is created in the system temporary directory. Otherwise,
   * returns an existing jar that contains a class of the same name. Maintains
   * a mapping from jar contents to the tmp jar created.
   * @param my_class the class to find.
   * @param fs the FileSystem with which to qualify the returned path.
   * @param packagedClasses a map of class name to path.
   * @return a jar file that contains the class.
   * @throws IOException
   */
  private static Path findOrCreateJar(Class<?> my_class, FileSystem fs,
      Map<String, String> packagedClasses)
  throws IOException {
    // attempt to locate an existing jar for the class.
    String jar = findContainingJar(my_class, packagedClasses);
    if (null == jar || jar.isEmpty()) {
      jar = getJar(my_class);
      updateMap(jar, packagedClasses);
    }

    if (null == jar || jar.isEmpty()) {
      return null;
    }

    LOG.debug(String.format("For class %s, using jar %s", my_class.getName(), jar));
    return new Path(jar).makeQualified(fs);
  }

  /**
   * Add entries to <code>packagedClasses</code> corresponding to class files
   * contained in <code>jar</code>.
   * @param jar The jar whose content to list.
   * @param packagedClasses map[class -> jar]
   */
  private static void updateMap(String jar, Map<String, String> packagedClasses)
      throws IOException {
    if (null == jar || jar.isEmpty()) {
      return;
    }
    ZipFile zip = null;
    try {
      zip = new ZipFile(jar);
      for (Enumeration<? extends ZipEntry> iter = zip.entries(); iter.hasMoreElements();) {
        ZipEntry entry = iter.nextElement();
        if (entry.getName().endsWith("class")) {
          packagedClasses.put(entry.getName(), jar);
        }
      }
    } finally {
      if (null != zip) zip.close();
    }
  }

  /**
   * Find a jar that contains a class of the same name, if any. It will return
   * a jar file, even if that is not the first thing on the class path that
   * has a class with the same name. Looks first on the classpath and then in
   * the <code>packagedClasses</code> map.
   * @param my_class the class to find.
   * @param packagedClasses a map of class name to containing jar.
   * @return a jar file that contains the class, or null.
   * @throws IOException
   */
  private static String findContainingJar(Class<?> my_class, Map<String, String> packagedClasses)
      throws IOException {
    ClassLoader loader = my_class.getClassLoader();
    String class_file = my_class.getName().replaceAll("\\.", "/") + ".class";

    // first search the classpath
    for (Enumeration<URL> itr = loader.getResources(class_file); itr.hasMoreElements();) {
      URL url = itr.nextElement();
      if ("jar".equals(url.getProtocol())) {
        String toReturn = url.getPath();
        if (toReturn.startsWith("file:")) {
          toReturn = toReturn.substring("file:".length());
        }
        // URLDecoder is a misnamed class, since it actually decodes
        // x-www-form-urlencoded MIME type rather than actual
        // URL encoding (which the file path has). It would therefore
        // decode +s to ' 's, which is incorrect (spaces are actual
        // characters in file paths), so we escape the +s first and
        // then decode the rest of the path.
        toReturn = toReturn.replaceAll("\\+", "%2B");
        toReturn = URLDecoder.decode(toReturn, "UTF-8");
        return toReturn.replaceAll("!.*$", "");
      }
    }

    // now look in any jars we've packaged using JarFinder. Returns null
    // when no jar is found.
    return packagedClasses.get(class_file);
  }

  /**
   * Invoke 'getJar' on a JarFinder implementation. Prefers Hadoop's own
   * <code>org.apache.hadoop.util.JarFinder</code> when it is available on the
   * classpath, and falls back to the backported copy shipped with HBase otherwise.
   * @param my_class the class to find or package into a jar.
   * @return the path to the jar containing the class, possibly created on the fly.
   */
  private static String getJar(Class<?> my_class) {
    String ret = null;
    String hadoopJarFinder = "org.apache.hadoop.util.JarFinder";
    Class<?> jarFinder = null;
    try {
      LOG.debug("Looking for " + hadoopJarFinder + ".");
      jarFinder = Class.forName(hadoopJarFinder);
      LOG.debug(hadoopJarFinder + " found.");
      Method getJar = jarFinder.getMethod("getJar", Class.class);
      ret = (String) getJar.invoke(null, my_class);
    } catch (ClassNotFoundException e) {
      LOG.debug("Using backported JarFinder.");
      ret = JarFinder.getJar(my_class);
    } catch (InvocationTargetException e) {
      // the method was properly called, but threw its own exception.
      // Unwrap it and pass it on.
      throw new RuntimeException(e.getCause());
    } catch (Exception e) {
      // toss all other exceptions, related to reflection failure
      throw new RuntimeException("getJar invocation failed.", e);
    }

    return ret;
  }
}