/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.UUID;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValueUtil;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.hfile.AbstractHFileWriter;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFileDataBlockEncoder;
import org.apache.hadoop.hbase.io.hfile.HFileDataBlockEncoderImpl;
import org.apache.hadoop.hbase.io.hfile.NoOpDataBlockEncoder;
import org.apache.hadoop.hbase.regionserver.BloomType;
import org.apache.hadoop.hbase.regionserver.HStore;
import org.apache.hadoop.hbase.regionserver.StoreFile;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

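/**
 * Writes HFiles. Passed Cells must arrive in order.
 * One HFile is written per column family per roll. The current time is
 * recorded as the bulk load timestamp in each file's metadata and the files
 * are marked as major compacted. Calling write(null, null) forcibly rolls
 * all HFiles being written.
 * <p>
 * Using this class as part of a MapReduce job is best done by calling
 * {@link #configureIncrementalLoad(Job, HTable)}.
 */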
@InterfaceAudience.Public
@InterfaceStability.Stable
public class HFileOutputFormat2 extends FileOutputFormat<ImmutableBytesWritable, Cell> {
  static Log LOG = LogFactory.getLog(HFileOutputFormat2.class);
  static final String COMPRESSION_CONF_KEY = "hbase.hfileoutputformat.families.compression";
  private static final String BLOOM_TYPE_CONF_KEY = "hbase.hfileoutputformat.families.bloomtype";
  private static final String DATABLOCK_ENCODING_CONF_KEY =
      "hbase.mapreduce.hfileoutputformat.datablock.encoding";
  private static final String BLOCK_SIZE_CONF_KEY = "hbase.mapreduce.hfileoutputformat.blocksize";

  @Override
  public RecordWriter<ImmutableBytesWritable, Cell> getRecordWriter(
      final TaskAttemptContext context) throws IOException, InterruptedException {
    return createRecordWriter(context);
  }

  static <V extends Cell> RecordWriter<ImmutableBytesWritable, V>
      createRecordWriter(final TaskAttemptContext context)
          throws IOException, InterruptedException {

    final Path outputPath = FileOutputFormat.getOutputPath(context);
    final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();
    final Configuration conf = context.getConfiguration();
    final FileSystem fs = outputdir.getFileSystem(conf);

    final long maxsize = conf.getLong(HConstants.HREGION_MAX_FILESIZE,
        HConstants.DEFAULT_MAX_FILE_SIZE);

    final String defaultCompression = conf.get("hfile.compression",
        Compression.Algorithm.NONE.getName());
    final boolean compactionExclude = conf.getBoolean(
        "hbase.mapreduce.hfileoutputformat.compaction.exclude", false);

    // Build maps of column family -> compression, bloom type, and block size
    // from the values serialized into the job configuration by
    // configureIncrementalLoad().
    final Map<byte[], String> compressionMap = createFamilyCompressionMap(conf);
    final Map<byte[], String> bloomTypeMap = createFamilyBloomMap(conf);
    final Map<byte[], String> blockSizeMap = createFamilyBlockSizeMap(conf);

    String dataBlockEncodingStr = conf.get(DATABLOCK_ENCODING_CONF_KEY);
    final HFileDataBlockEncoder encoder;
    if (dataBlockEncodingStr == null) {
      encoder = NoOpDataBlockEncoder.INSTANCE;
    } else {
      try {
        encoder = new HFileDataBlockEncoderImpl(DataBlockEncoding
            .valueOf(dataBlockEncodingStr));
      } catch (IllegalArgumentException ex) {
        throw new RuntimeException(
            "Invalid data block encoding type configured for the param "
                + DATABLOCK_ENCODING_CONF_KEY + " : " + dataBlockEncodingStr);
      }
    }

    return new RecordWriter<ImmutableBytesWritable, V>() {
      // Map of column families to writers and how much has been output on each writer.
      private final Map<byte [], WriterLength> writers =
          new TreeMap<byte [], WriterLength>(Bytes.BYTES_COMPARATOR);
      private byte [] previousRow = HConstants.EMPTY_BYTE_ARRAY;
      private final byte [] now = Bytes.toBytes(System.currentTimeMillis());
      private boolean rollRequested = false;

      @Override
      public void write(ImmutableBytesWritable row, V cell)
          throws IOException {
        KeyValue kv = KeyValueUtil.ensureKeyValue(cell);

        // null input == user explicitly wants to flush
        if (row == null && kv == null) {
          rollWriters();
          return;
        }

        byte [] rowKey = kv.getRow();
        long length = kv.getLength();
        byte [] family = kv.getFamily();
        WriterLength wl = this.writers.get(family);

        // If this is a new column family, verify that the directory exists
        if (wl == null) {
          fs.mkdirs(new Path(outputdir, Bytes.toString(family)));
        }

        // If any of the HFiles for the column families has reached
        // maxsize, we need to roll all the writers
        if (wl != null && wl.written + length >= maxsize) {
          this.rollRequested = true;
        }

        // This can only happen once a row is finished though
        if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
          rollWriters();
        }

        // Create a new writer for this column family, if necessary
        if (wl == null || wl.writer == null) {
          wl = getNewWriter(family, conf);
        }

        // We now have the proper writer. Full steam ahead
        kv.updateLatestStamp(this.now);
        wl.writer.append(kv);
        wl.written += length;

        // Copy the row so we know when a row transition happens
        this.previousRow = rowKey;
      }

      private void rollWriters() throws IOException {
        for (WriterLength wl : this.writers.values()) {
          if (wl.writer != null) {
            LOG.info("Writer=" + wl.writer.getPath() +
                ((wl.written == 0) ? "" : ", wrote=" + wl.written));
            close(wl.writer);
          }
          wl.writer = null;
          wl.written = 0;
        }
        this.rollRequested = false;
      }

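      /*
       * Create a new StoreFile.Writer for the given column family, writing into
       * a per-family subdirectory of the task output directory, and register it
       * in the writers map.
       */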
      private WriterLength getNewWriter(byte[] family, Configuration conf)
          throws IOException {
        WriterLength wl = new WriterLength();
        Path familydir = new Path(outputdir, Bytes.toString(family));
        String compression = compressionMap.get(family);
        compression = compression == null ? defaultCompression : compression;
        String bloomTypeStr = bloomTypeMap.get(family);
        BloomType bloomType = BloomType.NONE;
        if (bloomTypeStr != null) {
          bloomType = BloomType.valueOf(bloomTypeStr);
        }
        String blockSizeString = blockSizeMap.get(family);
        int blockSize = blockSizeString == null ? HConstants.DEFAULT_BLOCKSIZE
            : Integer.parseInt(blockSizeString);
        // Use a copy of the configuration with the block cache disabled; there is
        // no point caching blocks that are written once from a MapReduce task.
        Configuration tempConf = new Configuration(conf);
        tempConf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f);
        wl.writer = new StoreFile.WriterBuilder(conf, new CacheConfig(tempConf), fs, blockSize)
            .withOutputDir(familydir)
            .withCompression(AbstractHFileWriter.compressionByName(compression))
            .withBloomType(bloomType)
            .withComparator(KeyValue.COMPARATOR)
            .withDataBlockEncoder(encoder)
            .withChecksumType(HStore.getChecksumType(conf))
            .withBytesPerChecksum(HStore.getBytesPerChecksum(conf))
            .build();

        this.writers.put(family, wl);
        return wl;
      }

      private void close(final StoreFile.Writer w) throws IOException {
        if (w != null) {
          w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY,
              Bytes.toBytes(System.currentTimeMillis()));
          w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
              Bytes.toBytes(context.getTaskAttemptID().toString()));
          w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY,
              Bytes.toBytes(true));
          w.appendFileInfo(StoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY,
              Bytes.toBytes(compactionExclude));
          w.appendTrackedTimestampsToMetadata();
          w.close();
        }
      }

      @Override
      public void close(TaskAttemptContext c)
          throws IOException, InterruptedException {
        for (WriterLength wl : this.writers.values()) {
          close(wl.writer);
        }
      }
    };
  }
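
  /*
   * Data structure to hold a Writer and the amount of data written on it.
   */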
  static class WriterLength {
    long written = 0;
    StoreFile.Writer writer = null;
  }

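  /**
   * Return the start keys of all of the regions in this table,
   * as a list of ImmutableBytesWritable.
   */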
  private static List<ImmutableBytesWritable> getRegionStartKeys(HTable table)
      throws IOException {
    byte[][] byteKeys = table.getStartKeys();
    ArrayList<ImmutableBytesWritable> ret =
        new ArrayList<ImmutableBytesWritable>(byteKeys.length);
    for (byte[] byteKey : byteKeys) {
      ret.add(new ImmutableBytesWritable(byteKey));
    }
    return ret;
  }
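
  /**
   * Write out a {@link SequenceFile} that can be read by
   * {@link TotalOrderPartitioner} that contains the split points in startKeys.
   */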
  private static void writePartitions(Configuration conf, Path partitionsPath,
      List<ImmutableBytesWritable> startKeys) throws IOException {
    LOG.info("Writing partition information to " + partitionsPath);
    if (startKeys.isEmpty()) {
      throw new IllegalArgumentException("No regions passed");
    }

    // We're generating a list of split points, and we don't ever
    // have keys < the first region (which has an empty start key)
    // so we need to remove it. Otherwise we would end up with an
    // empty reducer with index 0.
    TreeSet<ImmutableBytesWritable> sorted =
        new TreeSet<ImmutableBytesWritable>(startKeys);

    ImmutableBytesWritable first = sorted.first();
    if (!first.equals(HConstants.EMPTY_BYTE_ARRAY)) {
      throw new IllegalArgumentException(
          "First region of table should have empty start key. Instead has: "
              + Bytes.toStringBinary(first.get()));
    }
    sorted.remove(first);

    // Write the actual file
    FileSystem fs = partitionsPath.getFileSystem(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs,
        conf, partitionsPath, ImmutableBytesWritable.class, NullWritable.class);

    try {
      for (ImmutableBytesWritable startKey : sorted) {
        writer.append(startKey, NullWritable.get());
      }
    } finally {
      writer.close();
    }
  }
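
  /**
   * Configure a MapReduce Job to perform an incremental load into the given
   * table. This
   * <ul>
   *   <li>Inspects the table to configure a total order partitioner</li>
   *   <li>Writes the partitions file and configures the TotalOrderPartitioner to use it</li>
   *   <li>Sets the number of reduce tasks to match the current number of regions</li>
   *   <li>Sets the output key/value class to match HFileOutputFormat2's requirements</li>
   *   <li>Sets the reducer up to perform the appropriate sorting (KeyValueSortReducer,
   *     PutSortReducer, or TextSortReducer, depending on the map output value class)</li>
   * </ul>
   * The user should be sure to set the map output value class to either KeyValue, Put,
   * or Text before running this function.
   */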
  public static void configureIncrementalLoad(Job job, HTable table)
      throws IOException {
    configureIncrementalLoad(job, table, HFileOutputFormat2.class);
  }

  static void configureIncrementalLoad(Job job, HTable table,
      Class<? extends OutputFormat<?, ?>> cls) throws IOException {
    Configuration conf = job.getConfiguration();

    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(cls);

    // Based on the configured map output class, set the correct reducer to
    // properly sort the incoming values.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(PutSortReducer.class);
    } else if (Text.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(TextSortReducer.class);
    } else {
      LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    conf.setStrings("io.serializations", conf.get("io.serializations"),
        MutationSerialization.class.getName(), ResultSerialization.class.getName(),
        KeyValueSerialization.class.getName());

    // Use the table's region boundaries as split points for the TotalOrderPartitioner.
    LOG.info("Looking up current regions for table " + Bytes.toString(table.getTableName()));
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions " +
        "to match current region count");
    job.setNumReduceTasks(startKeys.size());

    configurePartitioner(job, startKeys);

    // Serialize the per-family compression, bloom type, and block size settings
    // into the job configuration so the tasks can read them back.
    configureCompression(table, conf);
    configureBloomType(table, conf);
    configureBlockSize(table, conf);

    TableMapReduceUtil.addDependencyJars(job);
    TableMapReduceUtil.initCredentials(job);
    LOG.info("Incremental table " + Bytes.toString(table.getTableName()) + " output configured.");
  }

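  /**
   * Serialize column family to block size map to configuration.
   * Invoked while configuring the MR job for incremental load.
   */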
  private static void configureBlockSize(HTable table, Configuration conf) throws IOException {
    StringBuilder blockSizeConfigValue = new StringBuilder();
    HTableDescriptor tableDescriptor = table.getTableDescriptor();
    if (tableDescriptor == null) {
      // could happen with mock table instance
      return;
    }
    Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
    int i = 0;
    for (HColumnDescriptor familyDescriptor : families) {
      if (i++ > 0) {
        blockSizeConfigValue.append('&');
      }
      blockSizeConfigValue.append(URLEncoder.encode(
          familyDescriptor.getNameAsString(), "UTF-8"));
      blockSizeConfigValue.append('=');
      blockSizeConfigValue.append(URLEncoder.encode(
          String.valueOf(familyDescriptor.getBlocksize()), "UTF-8"));
    }

    conf.set(BLOCK_SIZE_CONF_KEY, blockSizeConfigValue.toString());
  }
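
  /**
   * Runs inside the task to deserialize the column family to compression
   * algorithm map from the configuration.
   *
   * @param conf to read the serialized values from
   * @return a map from column family to the configured compression algorithm
   */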
  static Map<byte[], String> createFamilyCompressionMap(Configuration conf) {
    return createFamilyConfValueMap(conf, COMPRESSION_CONF_KEY);
  }

  private static Map<byte[], String> createFamilyBloomMap(Configuration conf) {
    return createFamilyConfValueMap(conf, BLOOM_TYPE_CONF_KEY);
  }

  private static Map<byte[], String> createFamilyBlockSizeMap(Configuration conf) {
    return createFamilyConfValueMap(conf, BLOCK_SIZE_CONF_KEY);
  }
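
  /**
   * Runs inside the task to deserialize a column family to value map from the
   * given configuration key. Entries are expected in the form
   * family1=value1&amp;family2=value2, with both names and values URL-encoded.
   *
   * @param conf to read the serialized values from
   * @param confName conf key to read from the configuration
   * @return a map of column family to the given configuration value
   */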
  private static Map<byte[], String> createFamilyConfValueMap(
      Configuration conf, String confName) {
    Map<byte[], String> confValMap = new TreeMap<byte[], String>(Bytes.BYTES_COMPARATOR);
    String confVal = conf.get(confName, "");
    for (String familyConf : confVal.split("&")) {
      String[] familySplit = familyConf.split("=");
      if (familySplit.length != 2) {
        continue;
      }
      try {
        confValMap.put(URLDecoder.decode(familySplit[0], "UTF-8").getBytes("UTF-8"),
            URLDecoder.decode(familySplit[1], "UTF-8"));
      } catch (UnsupportedEncodingException e) {
        // will not happen with UTF-8 encoding
        throw new AssertionError(e);
      }
    }
    return confValMap;
  }
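
  /**
   * Configure <code>job</code> with a TotalOrderPartitioner, partitioning
   * against <code>splitPoints</code>. The partitions file is deleted when the
   * client JVM exits.
   */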
  static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints)
      throws IOException {

    // create the partitions file
    FileSystem fs = FileSystem.get(job.getConfiguration());
    Path partitionsPath = new Path("/tmp", "partitions_" + UUID.randomUUID());
    partitionsPath = fs.makeQualified(partitionsPath);
    fs.deleteOnExit(partitionsPath);
    writePartitions(job.getConfiguration(), partitionsPath, splitPoints);

    // configure job to use it
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsPath);
  }
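
  /**
   * Serialize column family to compression algorithm map to configuration.
   * Invoked while configuring the MR job for incremental load.
   *
   * @param table to read the properties from
   * @param conf to persist serialized values into
   * @throws IOException on failure to read column family descriptors
   */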
  @edu.umd.cs.findbugs.annotations.SuppressWarnings(
      value = "RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE")
  static void configureCompression(HTable table, Configuration conf) throws IOException {
    StringBuilder compressionConfigValue = new StringBuilder();
    HTableDescriptor tableDescriptor = table.getTableDescriptor();
    if (tableDescriptor == null) {
      // could happen with mock table instance
      return;
    }
    Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
    int i = 0;
    for (HColumnDescriptor familyDescriptor : families) {
      if (i++ > 0) {
        compressionConfigValue.append('&');
      }
      compressionConfigValue.append(URLEncoder.encode(
          familyDescriptor.getNameAsString(), "UTF-8"));
      compressionConfigValue.append('=');
      compressionConfigValue.append(URLEncoder.encode(
          familyDescriptor.getCompression().getName(), "UTF-8"));
    }

    conf.set(COMPRESSION_CONF_KEY, compressionConfigValue.toString());
  }
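
  /**
   * Serialize column family to bloom filter type map to configuration.
   * Invoked while configuring the MR job for incremental load.
   */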
  static void configureBloomType(HTable table, Configuration conf) throws IOException {
    HTableDescriptor tableDescriptor = table.getTableDescriptor();
    if (tableDescriptor == null) {
      // could happen with mock table instance
      return;
    }
    StringBuilder bloomTypeConfigValue = new StringBuilder();
    Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
    int i = 0;
    for (HColumnDescriptor familyDescriptor : families) {
      if (i++ > 0) {
        bloomTypeConfigValue.append('&');
      }
      bloomTypeConfigValue.append(URLEncoder.encode(
          familyDescriptor.getNameAsString(), "UTF-8"));
      bloomTypeConfigValue.append('=');
      String bloomType = familyDescriptor.getBloomFilterType().toString();
      if (bloomType == null) {
        bloomType = HColumnDescriptor.DEFAULT_BLOOMFILTER;
      }
      bloomTypeConfigValue.append(URLEncoder.encode(bloomType, "UTF-8"));
    }
    conf.set(BLOOM_TYPE_CONF_KEY, bloomTypeConfigValue.toString());
  }
}