/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.UUID;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.hfile.AbstractHFileWriter;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFileDataBlockEncoder;
import org.apache.hadoop.hbase.io.hfile.HFileDataBlockEncoderImpl;
import org.apache.hadoop.hbase.io.hfile.NoOpDataBlockEncoder;
import org.apache.hadoop.hbase.regionserver.BloomType;
import org.apache.hadoop.hbase.regionserver.HStore;
import org.apache.hadoop.hbase.regionserver.StoreFile;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
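
/**
 * Writes HFiles. Passed KeyValues must arrive in order.
 * Cells carrying LATEST_TIMESTAMP are rewritten with the time at which the record
 * writer was created, and the produced HFiles are tagged as major-compacted
 * bulk-load output. Calling write(null, null) forcibly rolls all HFiles currently
 * being written.
 * <p>
 * Using this class as part of a MapReduce job is best done via
 * {@link #configureIncrementalLoad(Job, HTable)}.
 * @see KeyValueSortReducer
 */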
@InterfaceAudience.Public
@InterfaceStability.Stable
public class HFileOutputFormat extends FileOutputFormat<ImmutableBytesWritable, KeyValue> {
  static Log LOG = LogFactory.getLog(HFileOutputFormat.class);
  static final String COMPRESSION_CONF_KEY = "hbase.hfileoutputformat.families.compression";
  private static final String BLOOM_TYPE_CONF_KEY = "hbase.hfileoutputformat.families.bloomtype";
  private static final String DATABLOCK_ENCODING_CONF_KEY =
      "hbase.mapreduce.hfileoutputformat.datablock.encoding";
  private static final String BLOCK_SIZE_CONF_KEY = "hbase.mapreduce.hfileoutputformat.blocksize";
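
  /**
   * Returns a RecordWriter that writes each incoming KeyValue into an HFile in a
   * per-column-family subdirectory of the task's work path, rolling to a new HFile
   * at a row boundary once hbase.hregion.max.filesize is exceeded.
   */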
  public RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(final TaskAttemptContext context)
      throws IOException, InterruptedException {
    // Get the path of the temporary output file
    final Path outputPath = FileOutputFormat.getOutputPath(context);
    final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();
    final Configuration conf = context.getConfiguration();
    final FileSystem fs = outputdir.getFileSystem(conf);

    // Maximum size an HFile may grow to before the writers are rolled
    final long maxsize = conf.getLong(HConstants.HREGION_MAX_FILESIZE,
        HConstants.DEFAULT_MAX_FILE_SIZE);

    // Default compression, used when no per-family compression is configured
    final String defaultCompression = conf.get("hfile.compression",
        Compression.Algorithm.NONE.getName());
    final boolean compactionExclude = conf.getBoolean(
        "hbase.mapreduce.hfileoutputformat.compaction.exclude", false);

    // Per-column-family overrides serialized into the job configuration
    final Map<byte[], String> compressionMap = createFamilyCompressionMap(conf);
    final Map<byte[], String> bloomTypeMap = createFamilyBloomMap(conf);
    final Map<byte[], String> blockSizeMap = createFamilyBlockSizeMap(conf);

    String dataBlockEncodingStr = conf.get(DATABLOCK_ENCODING_CONF_KEY);
    final HFileDataBlockEncoder encoder;
    if (dataBlockEncodingStr == null) {
      encoder = NoOpDataBlockEncoder.INSTANCE;
    } else {
      try {
        encoder = new HFileDataBlockEncoderImpl(DataBlockEncoding
            .valueOf(dataBlockEncodingStr));
      } catch (IllegalArgumentException ex) {
        throw new RuntimeException(
            "Invalid data block encoding type configured for the param "
                + DATABLOCK_ENCODING_CONF_KEY + " : " + dataBlockEncodingStr);
      }
    }

    return new RecordWriter<ImmutableBytesWritable, KeyValue>() {
      // Map of column family to the open writer for that family and bytes written so far
      private final Map<byte [], WriterLength> writers =
          new TreeMap<byte [], WriterLength>(Bytes.BYTES_COMPARATOR);
      private byte [] previousRow = HConstants.EMPTY_BYTE_ARRAY;
      private final byte [] now = Bytes.toBytes(System.currentTimeMillis());
      private boolean rollRequested = false;

      public void write(ImmutableBytesWritable row, KeyValue kv)
          throws IOException {
        // Null input is the caller's signal to flush: roll all current writers
        if (row == null && kv == null) {
          rollWriters();
          return;
        }

        byte [] rowKey = kv.getRow();
        long length = kv.getLength();
        byte [] family = kv.getFamily();
        WriterLength wl = this.writers.get(family);

        // If this is a new column family, create its output directory
        if (wl == null) {
          fs.mkdirs(new Path(outputdir, Bytes.toString(family)));
        }

        // If any of the HFiles for the column families has reached maxsize,
        // request a roll of all the writers
        if (wl != null && wl.written + length >= maxsize) {
          this.rollRequested = true;
        }

        // Rolling only happens once a row is finished, so no row spans two HFiles
        if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
          rollWriters();
        }

        // Create a new writer for this family if necessary
        if (wl == null || wl.writer == null) {
          wl = getNewWriter(family, conf);
        }

        // Replace LATEST_TIMESTAMP with the writer creation time, then append
        kv.updateLatestStamp(this.now);
        wl.writer.append(kv);
        wl.written += length;

        // Remember the row so a pending roll happens at the next row boundary
        this.previousRow = rowKey;
      }

      private void rollWriters() throws IOException {
        for (WriterLength wl : this.writers.values()) {
          if (wl.writer != null) {
            LOG.info("Writer=" + wl.writer.getPath() +
                ((wl.written == 0)? "": ", wrote=" + wl.written));
            close(wl.writer);
          }
          wl.writer = null;
          wl.written = 0;
        }
        this.rollRequested = false;
      }
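
      /*
       * Create a new StoreFile.Writer for the given column family, honoring any
       * per-family compression, bloom filter and block size settings from the
       * job configuration.
       * @return A WriterLength wrapping the new StoreFile.Writer.
       */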
      private WriterLength getNewWriter(byte[] family, Configuration conf)
          throws IOException {
        WriterLength wl = new WriterLength();
        Path familydir = new Path(outputdir, Bytes.toString(family));
        String compression = compressionMap.get(family);
        compression = compression == null ? defaultCompression : compression;
        String bloomTypeStr = bloomTypeMap.get(family);
        BloomType bloomType = BloomType.NONE;
        if (bloomTypeStr != null) {
          bloomType = BloomType.valueOf(bloomTypeStr);
        }
        String blockSizeString = blockSizeMap.get(family);
        int blockSize = blockSizeString == null ? HConstants.DEFAULT_BLOCKSIZE
            : Integer.parseInt(blockSizeString);
        // Write with the block cache disabled; caching is of no use to a write-only task
        Configuration tempConf = new Configuration(conf);
        tempConf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f);
        wl.writer = new StoreFile.WriterBuilder(conf, new CacheConfig(tempConf), fs, blockSize)
            .withOutputDir(familydir)
            .withCompression(AbstractHFileWriter.compressionByName(compression))
            .withBloomType(bloomType)
            .withComparator(KeyValue.COMPARATOR)
            .withDataBlockEncoder(encoder)
            .withChecksumType(HStore.getChecksumType(conf))
            .withBytesPerChecksum(HStore.getBytesPerChecksum(conf))
            .build();

        this.writers.put(family, wl);
        return wl;
      }

      private void close(final StoreFile.Writer w) throws IOException {
        if (w != null) {
          // Record bulk load metadata: load time, writing task, and compaction markers
          w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY,
              Bytes.toBytes(System.currentTimeMillis()));
          w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
              Bytes.toBytes(context.getTaskAttemptID().toString()));
          w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY,
              Bytes.toBytes(true));
          w.appendFileInfo(StoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY,
              Bytes.toBytes(compactionExclude));
          w.appendTrackedTimestampsToMetadata();
          w.close();
        }
      }

      public void close(TaskAttemptContext c)
          throws IOException, InterruptedException {
        for (WriterLength wl: this.writers.values()) {
          close(wl.writer);
        }
      }
    };
  }
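
  /*
   * Data structure to hold a StoreFile.Writer and the number of bytes written to it.
   */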
  static class WriterLength {
    long written = 0;
    StoreFile.Writer writer = null;
  }
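
  /**
   * Return the start keys of all of the regions in this table,
   * as a list of ImmutableBytesWritable.
   */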
  private static List<ImmutableBytesWritable> getRegionStartKeys(HTable table)
      throws IOException {
    byte[][] byteKeys = table.getStartKeys();
    ArrayList<ImmutableBytesWritable> ret =
        new ArrayList<ImmutableBytesWritable>(byteKeys.length);
    for (byte[] byteKey : byteKeys) {
      ret.add(new ImmutableBytesWritable(byteKey));
    }
    return ret;
  }
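
  /**
   * Write out a {@link SequenceFile} that can be read by
   * {@link TotalOrderPartitioner} and that contains the split points in startKeys.
   */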
  private static void writePartitions(Configuration conf, Path partitionsPath,
      List<ImmutableBytesWritable> startKeys) throws IOException {
    LOG.info("Writing partition information to " + partitionsPath);
    if (startKeys.isEmpty()) {
      throw new IllegalArgumentException("No regions passed");
    }

    // We're generating a list of split points, and we don't ever
    // have keys < the first region (which has an empty start key),
    // so we need to remove it. Otherwise we would end up with an
    // empty reducer with index 0.
    TreeSet<ImmutableBytesWritable> sorted =
        new TreeSet<ImmutableBytesWritable>(startKeys);

    ImmutableBytesWritable first = sorted.first();
    if (!first.equals(HConstants.EMPTY_BYTE_ARRAY)) {
      throw new IllegalArgumentException(
          "First region of table should have empty start key. Instead has: "
          + Bytes.toStringBinary(first.get()));
    }
    sorted.remove(first);

    // Write the actual partitions file
    FileSystem fs = partitionsPath.getFileSystem(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs,
        conf, partitionsPath, ImmutableBytesWritable.class, NullWritable.class);

    try {
      for (ImmutableBytesWritable startKey : sorted) {
        writer.append(startKey, NullWritable.get());
      }
    } finally {
      writer.close();
    }
  }
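
  /**
   * Configure a MapReduce Job to perform an incremental load into the given
   * table. This
   * <ul>
   *   <li>Inspects the table to configure a total order partitioner</li>
   *   <li>Writes the partitions file and configures the TotalOrderPartitioner to use it</li>
   *   <li>Sets the number of reduce tasks to match the current number of regions</li>
   *   <li>Sets the output key/value class to match HFileOutputFormat's requirements</li>
   *   <li>Sets the reducer up to perform the appropriate sorting (KeyValueSortReducer,
   *     PutSortReducer or TextSortReducer, depending on the map output value class)</li>
   * </ul>
   * The caller should set the map output value class (KeyValue, Put or Text) on the
   * job before invoking this method. A driver might wire things up roughly as in the
   * sketch below; <code>MyMapper</code>, the table name and the output path are
   * illustrative placeholders, not part of this API, and <code>conf</code> is an
   * existing Configuration:
   * <pre>
   * Job job = new Job(conf, "bulk-load");
   * job.setJarByClass(MyMapper.class);
   * job.setMapperClass(MyMapper.class);
   * job.setMapOutputKeyClass(ImmutableBytesWritable.class);
   * job.setMapOutputValueClass(KeyValue.class);
   * HTable table = new HTable(conf, "mytable");
   * HFileOutputFormat.configureIncrementalLoad(job, table);
   * FileOutputFormat.setOutputPath(job, new Path("/tmp/bulkload-output"));
   * job.waitForCompletion(true);
   * </pre>
   */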
  public static void configureIncrementalLoad(Job job, HTable table)
      throws IOException {
    Configuration conf = job.getConfiguration();

    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(HFileOutputFormat.class);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(PutSortReducer.class);
    } else if (Text.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(TextSortReducer.class);
    } else {
      LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    conf.setStrings("io.serializations", conf.get("io.serializations"),
        MutationSerialization.class.getName(), ResultSerialization.class.getName(),
        KeyValueSerialization.class.getName());

    // Use the table's current region boundaries as split points, one reducer per region
    LOG.info("Looking up current regions for table " + Bytes.toString(table.getTableName()));
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions " +
        "to match current region count");
    job.setNumReduceTasks(startKeys.size());

    configurePartitioner(job, startKeys);

    // Serialize per-family settings into the job configuration
    configureCompression(table, conf);
    configureBloomType(table, conf);
    configureBlockSize(table, conf);

    TableMapReduceUtil.addDependencyJars(job);
    TableMapReduceUtil.initCredentials(job);
    LOG.info("Incremental table " + Bytes.toString(table.getTableName()) + " output configured.");
  }
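
  /**
   * Serialize column family to block size map to configuration.
   * Invoked while configuring the MR job for incremental load.
   */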
  private static void configureBlockSize(HTable table, Configuration conf) throws IOException {
    StringBuilder blockSizeConfigValue = new StringBuilder();
    HTableDescriptor tableDescriptor = table.getTableDescriptor();
    if (tableDescriptor == null) {
      // could happen with mock table instance
      return;
    }
    Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
    int i = 0;
    for (HColumnDescriptor familyDescriptor : families) {
      if (i++ > 0) {
        blockSizeConfigValue.append('&');
      }
      blockSizeConfigValue.append(URLEncoder.encode(
          familyDescriptor.getNameAsString(), "UTF-8"));
      blockSizeConfigValue.append('=');
      blockSizeConfigValue.append(URLEncoder.encode(
          String.valueOf(familyDescriptor.getBlocksize()), "UTF-8"));
    }

    conf.set(BLOCK_SIZE_CONF_KEY, blockSizeConfigValue.toString());
  }
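
  /**
   * Run inside the task to deserialize the column family to compression algorithm
   * map from the configuration.
   *
   * @param conf to read the serialized values from
   * @return a map from column family to the name of the configured compression algorithm
   */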
  static Map<byte[], String> createFamilyCompressionMap(Configuration conf) {
    return createFamilyConfValueMap(conf, COMPRESSION_CONF_KEY);
  }

  private static Map<byte[], String> createFamilyBloomMap(Configuration conf) {
    return createFamilyConfValueMap(conf, BLOOM_TYPE_CONF_KEY);
  }

  private static Map<byte[], String> createFamilyBlockSizeMap(Configuration conf) {
    return createFamilyConfValueMap(conf, BLOCK_SIZE_CONF_KEY);
  }
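
  /**
   * Run inside the task to deserialize a column family to value map from the
   * configuration.
   *
   * @param conf to read the serialized values from
   * @param confName the configuration key under which the map was serialized
   * @return a map from column family to the corresponding configuration value
   */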
  private static Map<byte[], String> createFamilyConfValueMap(Configuration conf, String confName) {
    Map<byte[], String> confValMap = new TreeMap<byte[], String>(Bytes.BYTES_COMPARATOR);
    String confVal = conf.get(confName, "");
    for (String familyConf : confVal.split("&")) {
      String[] familySplit = familyConf.split("=");
      if (familySplit.length != 2) {
        continue;
      }
      try {
        confValMap.put(URLDecoder.decode(familySplit[0], "UTF-8").getBytes(),
            URLDecoder.decode(familySplit[1], "UTF-8"));
      } catch (UnsupportedEncodingException e) {
        // will not happen with UTF-8 encoding
        throw new AssertionError(e);
      }
    }
    return confValMap;
  }
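
  /**
   * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
   * <code>splitPoints</code>. The partitions file is written under /tmp and
   * registered for deletion via FileSystem#deleteOnExit.
   */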
  static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints)
      throws IOException {

    // Create the partitions file
    FileSystem fs = FileSystem.get(job.getConfiguration());
    Path partitionsPath = new Path("/tmp", "partitions_" + UUID.randomUUID());
    fs.makeQualified(partitionsPath);
    fs.deleteOnExit(partitionsPath);
    writePartitions(job.getConfiguration(), partitionsPath, splitPoints);

    // Configure the job to use it
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsPath);
  }
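
  /**
   * Serialize column family to compression algorithm map to configuration.
   * Invoked while configuring the MR job for incremental load.
   */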
  @edu.umd.cs.findbugs.annotations.SuppressWarnings(
      value="RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE")
  static void configureCompression(HTable table, Configuration conf) throws IOException {
    StringBuilder compressionConfigValue = new StringBuilder();
    HTableDescriptor tableDescriptor = table.getTableDescriptor();
    if (tableDescriptor == null) {
      // could happen with mock table instance
      return;
    }
    Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
    int i = 0;
    for (HColumnDescriptor familyDescriptor : families) {
      if (i++ > 0) {
        compressionConfigValue.append('&');
      }
      compressionConfigValue.append(URLEncoder.encode(familyDescriptor.getNameAsString(), "UTF-8"));
      compressionConfigValue.append('=');
      compressionConfigValue.append(URLEncoder.encode(familyDescriptor.getCompression().getName(), "UTF-8"));
    }

    conf.set(COMPRESSION_CONF_KEY, compressionConfigValue.toString());
  }
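
  /**
   * Serialize column family to bloom type map to configuration.
   * Invoked while configuring the MR job for incremental load.
   */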
  static void configureBloomType(HTable table, Configuration conf) throws IOException {
    HTableDescriptor tableDescriptor = table.getTableDescriptor();
    if (tableDescriptor == null) {
      // could happen with mock table instance
      return;
    }
    StringBuilder bloomTypeConfigValue = new StringBuilder();
    Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
    int i = 0;
    for (HColumnDescriptor familyDescriptor : families) {
      if (i++ > 0) {
        bloomTypeConfigValue.append('&');
      }
      bloomTypeConfigValue.append(URLEncoder.encode(familyDescriptor.getNameAsString(), "UTF-8"));
      bloomTypeConfigValue.append('=');
      String bloomType = familyDescriptor.getBloomFilterType().toString();
      if (bloomType == null) {
        bloomType = HColumnDescriptor.DEFAULT_BLOOMFILTER;
      }
      bloomTypeConfigValue.append(URLEncoder.encode(bloomType, "UTF-8"));
    }
    conf.set(BLOOM_TYPE_CONF_KEY, bloomTypeConfigValue.toString());
  }
}