/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import com.google.common.annotations.VisibleForTesting;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.UUID;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.io.compress.Compression.Algorithm;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.hfile.AbstractHFileWriter;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFileContext;
import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
import org.apache.hadoop.hbase.regionserver.BloomType;
import org.apache.hadoop.hbase.regionserver.HStore;
import org.apache.hadoop.hbase.regionserver.StoreFile;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

/**
 * Writes HFiles. Passed KeyValues must arrive in order.
 * Writes the current time as the sequence id for the file. Sets the major compacted
 * attribute on created HFiles. Calling write(null, null) will forcibly roll
 * all HFiles being written.
 * <p>
 * Using this class as part of a MapReduce job is best done
 * using {@link #configureIncrementalLoad(Job, HTable)}.
 * @see KeyValueSortReducer
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class HFileOutputFormat extends FileOutputFormat<ImmutableBytesWritable, KeyValue> {
  static Log LOG = LogFactory.getLog(HFileOutputFormat.class);

  // Conf keys under which the per-column-family settings are serialized
  // (as URL-encoded family=value pairs joined by '&') so they can be passed
  // from job setup to the record writers running in each task.
  private static final String COMPRESSION_FAMILIES_CONF_KEY =
      "hbase.hfileoutputformat.families.compression";
  private static final String BLOOM_TYPE_FAMILIES_CONF_KEY =
      "hbase.hfileoutputformat.families.bloomtype";
  private static final String BLOCK_SIZE_FAMILIES_CONF_KEY =
      "hbase.mapreduce.hfileoutputformat.blocksize";
  private static final String DATABLOCK_ENCODING_FAMILIES_CONF_KEY =
      "hbase.mapreduce.hfileoutputformat.families.datablock.encoding";

  /**
   * Conf key that specifies a data block encoding to apply to all column families,
   * overriding any per-family value serialized from the table descriptor.
   */
  public static final String DATABLOCK_ENCODING_OVERRIDE_CONF_KEY =
      "hbase.mapreduce.hfileoutputformat.datablock.encoding";

  public RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(final TaskAttemptContext context)
      throws IOException, InterruptedException {
    // Get the path of the task's temporary output directory.
    final Path outputPath = FileOutputFormat.getOutputPath(context);
    final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();
    final Configuration conf = context.getConfiguration();
    final FileSystem fs = outputdir.getFileSystem(conf);
    // Roll HFiles once they grow past the configured maximum region size.
    final long maxsize = conf.getLong(HConstants.HREGION_MAX_FILESIZE,
        HConstants.DEFAULT_MAX_FILE_SIZE);
    // Default compression for families that carry no explicit setting.
    final String defaultCompressionStr = conf.get("hfile.compression",
        Compression.Algorithm.NONE.getName());
    final Algorithm defaultCompression = AbstractHFileWriter
        .compressionByName(defaultCompressionStr);
    final boolean compactionExclude = conf.getBoolean(
        "hbase.mapreduce.hfileoutputformat.compaction.exclude", false);

    // Deserialize the per-column-family settings written by configureIncrementalLoad().
    final Map<byte[], Algorithm> compressionMap = createFamilyCompressionMap(conf);
    final Map<byte[], BloomType> bloomTypeMap = createFamilyBloomTypeMap(conf);
    final Map<byte[], Integer> blockSizeMap = createFamilyBlockSizeMap(conf);

    String dataBlockEncodingStr = conf.get(DATABLOCK_ENCODING_OVERRIDE_CONF_KEY);
    final Map<byte[], DataBlockEncoding> datablockEncodingMap
        = createFamilyDataBlockEncodingMap(conf);
    final DataBlockEncoding overriddenEncoding;
    if (dataBlockEncodingStr != null) {
      overriddenEncoding = DataBlockEncoding.valueOf(dataBlockEncodingStr);
    } else {
      overriddenEncoding = null;
    }

    return new RecordWriter<ImmutableBytesWritable, KeyValue>() {
      // Map of column family to the open writer and how much it has written so far.
      private final Map<byte [], WriterLength> writers =
          new TreeMap<byte [], WriterLength>(Bytes.BYTES_COMPARATOR);
      private byte [] previousRow = HConstants.EMPTY_BYTE_ARRAY;
      private final byte [] now = Bytes.toBytes(System.currentTimeMillis());
      private boolean rollRequested = false;

      public void write(ImmutableBytesWritable row, KeyValue kv)
          throws IOException {
        // Null input == the caller explicitly wants to flush and roll all writers.
        if (row == null && kv == null) {
          rollWriters();
          return;
        }

        byte [] rowKey = kv.getRow();
        long length = kv.getLength();
        byte [] family = kv.getFamily();
        WriterLength wl = this.writers.get(family);

        // If this is a new column family, make sure its output directory exists.
        if (wl == null) {
          fs.mkdirs(new Path(outputdir, Bytes.toString(family)));
        }

        // If any of the HFiles for the column families has reached maxsize,
        // we need to roll all the writers.
        if (wl != null && wl.written + length >= maxsize) {
          this.rollRequested = true;
        }

        // Rolling can only happen once the current row is finished, though.
        if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
          rollWriters();
        }

        // Create a new writer for this family, if necessary.
        if (wl == null || wl.writer == null) {
          wl = getNewWriter(family, conf);
        }

        // We now have the proper writer. Stamp the cell and append it.
        kv.updateLatestStamp(this.now);
        wl.writer.append(kv);
        wl.written += length;

        // Remember the row so we can detect when a new row starts.
        this.previousRow = rowKey;
      }

      private void rollWriters() throws IOException {
        for (WriterLength wl : this.writers.values()) {
          if (wl.writer != null) {
            LOG.info("Writer=" + wl.writer.getPath() +
                ((wl.written == 0)? "": ", wrote=" + wl.written));
            close(wl.writer);
          }
          wl.writer = null;
          wl.written = 0;
        }
        this.rollRequested = false;
      }

      /*
       * Create a new StoreFile.Writer for the given column family.
       * @param family column family name
       * @return a WriterLength wrapping the new StoreFile.Writer
       * @throws IOException on writer creation failure
       */
      private WriterLength getNewWriter(byte[] family, Configuration conf)
          throws IOException {
        WriterLength wl = new WriterLength();
        Path familydir = new Path(outputdir, Bytes.toString(family));
        Algorithm compression = compressionMap.get(family);
        compression = compression == null ? defaultCompression : compression;
        BloomType bloomType = bloomTypeMap.get(family);
        bloomType = bloomType == null ? BloomType.NONE : bloomType;
        Integer blockSize = blockSizeMap.get(family);
        blockSize = blockSize == null ? HConstants.DEFAULT_BLOCKSIZE : blockSize;
        DataBlockEncoding encoding = overriddenEncoding;
        encoding = encoding == null ? datablockEncodingMap.get(family) : encoding;
        encoding = encoding == null ? DataBlockEncoding.NONE : encoding;
        Configuration tempConf = new Configuration(conf);
        tempConf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f);
        HFileContextBuilder contextBuilder = new HFileContextBuilder()
            .withCompression(compression)
            .withChecksumType(HStore.getChecksumType(conf))
            .withBytesPerCheckSum(HStore.getBytesPerChecksum(conf))
            .withBlockSize(blockSize);
        contextBuilder.withDataBlockEncoding(encoding);
        HFileContext hFileContext = contextBuilder.build();

        wl.writer = new StoreFile.WriterBuilder(conf, new CacheConfig(tempConf), fs)
            .withOutputDir(familydir).withBloomType(bloomType).withComparator(KeyValue.COMPARATOR)
            .withFileContext(hFileContext)
            .build();

        this.writers.put(family, wl);
        return wl;
      }

      private void close(final StoreFile.Writer w) throws IOException {
        if (w != null) {
          w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY,
              Bytes.toBytes(System.currentTimeMillis()));
          w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
              Bytes.toBytes(context.getTaskAttemptID().toString()));
          w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY,
              Bytes.toBytes(true));
          w.appendFileInfo(StoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY,
              Bytes.toBytes(compactionExclude));
          w.appendTrackedTimestampsToMetadata();
          w.close();
        }
      }

      public void close(TaskAttemptContext c)
          throws IOException, InterruptedException {
        for (WriterLength wl: this.writers.values()) {
          close(wl.writer);
        }
      }
    };
  }

  /*
   * Data structure to hold a Writer and the amount of data written on it.
   */
  static class WriterLength {
    long written = 0;
    StoreFile.Writer writer = null;
  }

  /**
   * Return the start keys of all of the regions in this table,
   * as a list of ImmutableBytesWritable.
   */
  private static List<ImmutableBytesWritable> getRegionStartKeys(HTable table)
      throws IOException {
    byte[][] byteKeys = table.getStartKeys();
    ArrayList<ImmutableBytesWritable> ret =
        new ArrayList<ImmutableBytesWritable>(byteKeys.length);
    for (byte[] byteKey : byteKeys) {
      ret.add(new ImmutableBytesWritable(byteKey));
    }
    return ret;
  }

  /**
   * Write out a {@link SequenceFile} that can be read by
   * {@link TotalOrderPartitioner} and that contains the split points in startKeys.
   */
  private static void writePartitions(Configuration conf, Path partitionsPath,
      List<ImmutableBytesWritable> startKeys) throws IOException {
    LOG.info("Writing partition information to " + partitionsPath);
    if (startKeys.isEmpty()) {
      throw new IllegalArgumentException("No regions passed");
    }

    // We're generating a list of split points, and we don't ever
    // have keys < the first region (which has an empty start key),
    // so we need to remove it. Otherwise we would end up with an
    // empty reducer with index 0.
    TreeSet<ImmutableBytesWritable> sorted =
        new TreeSet<ImmutableBytesWritable>(startKeys);

    ImmutableBytesWritable first = sorted.first();
    if (!first.equals(HConstants.EMPTY_BYTE_ARRAY)) {
      throw new IllegalArgumentException(
          "First region of table should have empty start key. Instead has: "
          + Bytes.toStringBinary(first.get()));
    }
    sorted.remove(first);

    // Write the actual partitions file.
    FileSystem fs = partitionsPath.getFileSystem(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs,
        conf, partitionsPath, ImmutableBytesWritable.class, NullWritable.class);

    try {
      for (ImmutableBytesWritable startKey : sorted) {
        writer.append(startKey, NullWritable.get());
      }
    } finally {
      writer.close();
    }
  }
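
  // Worked example (illustrative): for a table whose three regions start at
  // "" (empty), "m", and "t", writePartitions() drops the empty first key and
  // writes only ["m", "t"] to the partitions file. TotalOrderPartitioner then
  // routes rows < "m" to reducer 0, ["m", "t") to reducer 1, and >= "t" to
  // reducer 2, matching the three regions.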

  /**
   * Configure a MapReduce Job to perform an incremental load into the given
   * table. This
   * <ul>
   *   <li>Inspects the table to configure a total order partitioner</li>
   *   <li>Writes the partitions file and configures the partitioner to read it</li>
   *   <li>Sets the number of reduce tasks to match the current number of regions</li>
   *   <li>Sets the output key/value class to match HFileOutputFormat's requirements</li>
   *   <li>Sets the reducer up to perform the appropriate sorting (either
   *     KeyValueSortReducer or PutSortReducer)</li>
   * </ul>
   * The user should be sure to set the map output value class to either KeyValue
   * or Put before running this function.
   */
  public static void configureIncrementalLoad(Job job, HTable table)
      throws IOException {
    Configuration conf = job.getConfiguration();

    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(HFileOutputFormat.class);

    // Based on the configured map output class, set the correct reducer to
    // properly sort the incoming data.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(PutSortReducer.class);
    } else if (Text.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(TextSortReducer.class);
    } else {
      LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    conf.setStrings("io.serializations", conf.get("io.serializations"),
        MutationSerialization.class.getName(), ResultSerialization.class.getName(),
        KeyValueSerialization.class.getName());

    // Use the table's current region boundaries as split points, with one
    // reduce task per region.
    LOG.info("Looking up current regions for table " + Bytes.toString(table.getTableName()));
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions " +
        "to match current region count");
    job.setNumReduceTasks(startKeys.size());

    configurePartitioner(job, startKeys);

    // Serialize the per-family settings from the table descriptor into the conf
    // so each task's record writer can apply them.
    configureCompression(table, conf);
    configureBloomType(table, conf);
    configureBlockSize(table, conf);
    configureDataBlockEncoding(table, conf);

    TableMapReduceUtil.addDependencyJars(job);
    TableMapReduceUtil.initCredentials(job);
    LOG.info("Incremental table " + Bytes.toString(table.getTableName()) + " output configured.");
  }
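
  // Illustrative driver sketch (not part of this class): the driver and mapper
  // class names below are hypothetical, and the mapper is assumed to emit
  // ImmutableBytesWritable row keys with Put values.
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   Job job = new Job(conf, "prepare-bulk-load");
  //   job.setJarByClass(MyBulkLoadDriver.class);
  //   job.setMapperClass(MyPutEmittingMapper.class);
  //   job.setMapOutputKeyClass(ImmutableBytesWritable.class);
  //   job.setMapOutputValueClass(Put.class);
  //   FileInputFormat.addInputPath(job, new Path("/input/rows"));
  //   FileOutputFormat.setOutputPath(job, new Path("/staging/hfiles"));
  //   HTable table = new HTable(conf, "mytable");
  //   HFileOutputFormat.configureIncrementalLoad(job, table);
  //   job.waitForCompletion(true);
  //   // The generated HFiles can then be handed to LoadIncrementalHFiles.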

  /**
   * Runs inside the task to deserialize the column family to compression
   * algorithm map from the configuration.
   *
   * @param conf to read the serialized values from
   * @return a map from column family to the configured compression algorithm
   */
  @VisibleForTesting
  static Map<byte[], Algorithm> createFamilyCompressionMap(Configuration conf) {
    Map<byte[], String> stringMap = createFamilyConfValueMap(conf,
        COMPRESSION_FAMILIES_CONF_KEY);
    Map<byte[], Algorithm> compressionMap =
        new TreeMap<byte[], Algorithm>(Bytes.BYTES_COMPARATOR);
    for (Map.Entry<byte[], String> e : stringMap.entrySet()) {
      Algorithm algorithm = AbstractHFileWriter.compressionByName(e.getValue());
      compressionMap.put(e.getKey(), algorithm);
    }
    return compressionMap;
  }

  /**
   * Runs inside the task to deserialize the column family to bloom filter type
   * map from the configuration.
   *
   * @param conf to read the serialized values from
   * @return a map from column family to the configured bloom filter type
   */
  @VisibleForTesting
  static Map<byte[], BloomType> createFamilyBloomTypeMap(Configuration conf) {
    Map<byte[], String> stringMap = createFamilyConfValueMap(conf,
        BLOOM_TYPE_FAMILIES_CONF_KEY);
    Map<byte[], BloomType> bloomTypeMap =
        new TreeMap<byte[], BloomType>(Bytes.BYTES_COMPARATOR);
    for (Map.Entry<byte[], String> e : stringMap.entrySet()) {
      BloomType bloomType = BloomType.valueOf(e.getValue());
      bloomTypeMap.put(e.getKey(), bloomType);
    }
    return bloomTypeMap;
  }

  /**
   * Runs inside the task to deserialize the column family to block size
   * map from the configuration.
   *
   * @param conf to read the serialized values from
   * @return a map from column family to the configured block size
   */
  @VisibleForTesting
  static Map<byte[], Integer> createFamilyBlockSizeMap(Configuration conf) {
    Map<byte[], String> stringMap = createFamilyConfValueMap(conf,
        BLOCK_SIZE_FAMILIES_CONF_KEY);
    Map<byte[], Integer> blockSizeMap =
        new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
    for (Map.Entry<byte[], String> e : stringMap.entrySet()) {
      Integer blockSize = Integer.parseInt(e.getValue());
      blockSizeMap.put(e.getKey(), blockSize);
    }
    return blockSizeMap;
  }

  /**
   * Runs inside the task to deserialize the column family to data block encoding
   * map from the configuration.
   *
   * @param conf to read the serialized values from
   * @return a map from column family to the configured data block encoding
   */
  @VisibleForTesting
  static Map<byte[], DataBlockEncoding> createFamilyDataBlockEncodingMap(
      Configuration conf) {
    Map<byte[], String> stringMap = createFamilyConfValueMap(conf,
        DATABLOCK_ENCODING_FAMILIES_CONF_KEY);
    Map<byte[], DataBlockEncoding> encoderMap =
        new TreeMap<byte[], DataBlockEncoding>(Bytes.BYTES_COMPARATOR);
    for (Map.Entry<byte[], String> e : stringMap.entrySet()) {
      encoderMap.put(e.getKey(), DataBlockEncoding.valueOf(e.getValue()));
    }
    return encoderMap;
  }

  /**
   * Runs inside the task to deserialize a column family to value map for the
   * given configuration key.
   *
   * @param conf to read the serialized values from
   * @param confName conf key to read from the configuration
   * @return a map of column family to the given configuration value
   */
  private static Map<byte[], String> createFamilyConfValueMap(Configuration conf, String confName) {
    Map<byte[], String> confValMap = new TreeMap<byte[], String>(Bytes.BYTES_COMPARATOR);
    String confVal = conf.get(confName, "");
    for (String familyConf : confVal.split("&")) {
      String[] familySplit = familyConf.split("=");
      if (familySplit.length != 2) {
        continue;
      }
      try {
        confValMap.put(URLDecoder.decode(familySplit[0], "UTF-8").getBytes(),
            URLDecoder.decode(familySplit[1], "UTF-8"));
      } catch (UnsupportedEncodingException e) {
        // Will not happen: UTF-8 is always supported.
        throw new AssertionError(e);
      }
    }
    return confValMap;
  }
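
  // Illustrative only: the serialized form is URL-encoded family=value pairs
  // joined by '&'. For example, a compression setting of
  //
  //   "cf1=GZ&cf2=NONE"
  //
  // deserializes here to {cf1 -> "GZ", cf2 -> "NONE"}; configureCompression()
  // below produces exactly this shape from the table descriptor.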

  /**
   * Configure <code>job</code> with a TotalOrderPartitioner, partitioning
   * against <code>splitPoints</code>.
   */
  static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints)
      throws IOException {

    // Create the partitions file in a temporary location; it is marked for
    // deletion when the FileSystem is closed.
    FileSystem fs = FileSystem.get(job.getConfiguration());
    Path partitionsPath = new Path("/tmp", "partitions_" + UUID.randomUUID());
    partitionsPath = fs.makeQualified(partitionsPath);
    fs.deleteOnExit(partitionsPath);
    writePartitions(job.getConfiguration(), partitionsPath, splitPoints);

    // Configure the job to use it.
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsPath);
  }

  /**
   * Serialize the column family to compression algorithm map to the configuration.
   * Invoked while configuring the MR job for incremental load.
   *
   * @param table to read the properties from
   * @param conf to persist serialized values into
   * @throws IOException
   *           on failure to read column family descriptors
   */
  @edu.umd.cs.findbugs.annotations.SuppressWarnings(
      value="RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE")
  @VisibleForTesting
  static void configureCompression(HTable table, Configuration conf) throws IOException {
    StringBuilder compressionConfigValue = new StringBuilder();
    HTableDescriptor tableDescriptor = table.getTableDescriptor();
    if (tableDescriptor == null) {
      // Could happen with a mock table instance.
      return;
    }
    Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
    int i = 0;
    for (HColumnDescriptor familyDescriptor : families) {
      if (i++ > 0) {
        compressionConfigValue.append('&');
      }
      compressionConfigValue.append(URLEncoder.encode(familyDescriptor.getNameAsString(), "UTF-8"));
      compressionConfigValue.append('=');
      compressionConfigValue.append(URLEncoder.encode(familyDescriptor.getCompression().getName(), "UTF-8"));
    }

    conf.set(COMPRESSION_FAMILIES_CONF_KEY, compressionConfigValue.toString());
  }

  /**
   * Serialize the column family to block size map to the configuration.
   * Invoked while configuring the MR job for incremental load.
   *
   * @param table to read the properties from
   * @param conf to persist serialized values into
   * @throws IOException
   *           on failure to read column family descriptors
   */
  @VisibleForTesting
  static void configureBlockSize(HTable table, Configuration conf) throws IOException {
    StringBuilder blockSizeConfigValue = new StringBuilder();
    HTableDescriptor tableDescriptor = table.getTableDescriptor();
    if (tableDescriptor == null) {
      // Could happen with a mock table instance.
      return;
    }
    Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
    int i = 0;
    for (HColumnDescriptor familyDescriptor : families) {
      if (i++ > 0) {
        blockSizeConfigValue.append('&');
      }
      blockSizeConfigValue.append(URLEncoder.encode(
          familyDescriptor.getNameAsString(), "UTF-8"));
      blockSizeConfigValue.append('=');
      blockSizeConfigValue.append(URLEncoder.encode(
          String.valueOf(familyDescriptor.getBlocksize()), "UTF-8"));
    }

    conf.set(BLOCK_SIZE_FAMILIES_CONF_KEY, blockSizeConfigValue.toString());
  }

  /**
   * Serialize the column family to bloom filter type map to the configuration.
   * Invoked while configuring the MR job for incremental load.
   *
   * @param table to read the properties from
   * @param conf to persist serialized values into
   * @throws IOException
   *           on failure to read column family descriptors
   */
  @VisibleForTesting
  static void configureBloomType(HTable table, Configuration conf) throws IOException {
    HTableDescriptor tableDescriptor = table.getTableDescriptor();
    if (tableDescriptor == null) {
      // Could happen with a mock table instance.
      return;
    }
    StringBuilder bloomTypeConfigValue = new StringBuilder();
    Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
    int i = 0;
    for (HColumnDescriptor familyDescriptor : families) {
      if (i++ > 0) {
        bloomTypeConfigValue.append('&');
      }
      bloomTypeConfigValue.append(URLEncoder.encode(familyDescriptor.getNameAsString(), "UTF-8"));
      bloomTypeConfigValue.append('=');
      String bloomType = familyDescriptor.getBloomFilterType().toString();
      if (bloomType == null) {
        bloomType = HColumnDescriptor.DEFAULT_BLOOMFILTER;
      }
      bloomTypeConfigValue.append(URLEncoder.encode(bloomType, "UTF-8"));
    }
    conf.set(BLOOM_TYPE_FAMILIES_CONF_KEY, bloomTypeConfigValue.toString());
  }

  /**
   * Serialize the column family to data block encoding map to the configuration.
   * Invoked while configuring the MR job for incremental load.
   *
   * @param table to read the properties from
   * @param conf to persist serialized values into
   * @throws IOException
   *           on failure to read column family descriptors
   */
  @VisibleForTesting
  static void configureDataBlockEncoding(HTable table,
      Configuration conf) throws IOException {
    HTableDescriptor tableDescriptor = table.getTableDescriptor();
    if (tableDescriptor == null) {
      // Could happen with a mock table instance.
      return;
    }
    StringBuilder dataBlockEncodingConfigValue = new StringBuilder();
    Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
    int i = 0;
    for (HColumnDescriptor familyDescriptor : families) {
      if (i++ > 0) {
        dataBlockEncodingConfigValue.append('&');
      }
      dataBlockEncodingConfigValue.append(
          URLEncoder.encode(familyDescriptor.getNameAsString(), "UTF-8"));
      dataBlockEncodingConfigValue.append('=');
      DataBlockEncoding encoding = familyDescriptor.getDataBlockEncoding();
      if (encoding == null) {
        encoding = DataBlockEncoding.NONE;
      }
      dataBlockEncodingConfigValue.append(URLEncoder.encode(encoding.toString(),
          "UTF-8"));
    }
    conf.set(DATABLOCK_ENCODING_FAMILIES_CONF_KEY,
        dataBlockEncodingConfigValue.toString());
  }
}