/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.UUID;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.hfile.AbstractHFileWriter;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileDataBlockEncoder;
import org.apache.hadoop.hbase.io.hfile.HFileDataBlockEncoderImpl;
import org.apache.hadoop.hbase.io.hfile.NoOpDataBlockEncoder;
import org.apache.hadoop.hbase.regionserver.Store;
import org.apache.hadoop.hbase.regionserver.StoreFile;
import org.apache.hadoop.hbase.regionserver.StoreFile.BloomType;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
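
// A minimal driver sketch showing how this output format is typically wired
// into a job via configureIncrementalLoad. The table name, paths, and mapper
// class below are illustrative assumptions, not part of this class:
//
//   Configuration conf = HBaseConfiguration.create();
//   Job job = new Job(conf, "hfile-prepare");
//   job.setMapperClass(MyKeyValueMapper.class);              // hypothetical mapper
//   job.setMapOutputKeyClass(ImmutableBytesWritable.class);
//   job.setMapOutputValueClass(KeyValue.class);              // or Put
//   FileInputFormat.addInputPath(job, new Path("/input"));   // placeholder path
//   FileOutputFormat.setOutputPath(job, new Path("/hfiles"));
//   HTable table = new HTable(conf, "mytable");              // placeholder table
//   HFileOutputFormat.configureIncrementalLoad(job, table);
//   job.waitForCompletion(true);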

/**
 * Writes HFiles for bulk import into HBase. Passed KeyValues must arrive in
 * sorted order. Writes the current time as the sequence id for each file and
 * sets the major-compaction attribute on the HFiles it creates. Calling
 * write(null, null) forcibly rolls all open HFile writers.
 * <p>
 * When used as part of a MapReduce job, configure it with
 * {@link #configureIncrementalLoad(Job, HTable)}.
 * @see KeyValueSortReducer
 */
public class HFileOutputFormat extends FileOutputFormat<ImmutableBytesWritable, KeyValue> {
  static Log LOG = LogFactory.getLog(HFileOutputFormat.class);
  static final String COMPRESSION_CONF_KEY = "hbase.hfileoutputformat.families.compression";
  private static final String BLOOM_TYPE_CONF_KEY = "hbase.hfileoutputformat.families.bloomtype";
  private static final String DATABLOCK_ENCODING_CONF_KEY =
      "hbase.mapreduce.hfileoutputformat.datablock.encoding";

  public RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(final TaskAttemptContext context)
      throws IOException, InterruptedException {
    // Get the path of the task's temporary output directory
    final Path outputPath = FileOutputFormat.getOutputPath(context);
    final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();
    final Configuration conf = context.getConfiguration();
    final FileSystem fs = outputdir.getFileSystem(conf);

    final long maxsize = conf.getLong(HConstants.HREGION_MAX_FILESIZE,
        HConstants.DEFAULT_MAX_FILE_SIZE);
    final int blocksize = conf.getInt("hbase.mapreduce.hfileoutputformat.blocksize",
        HFile.DEFAULT_BLOCKSIZE);
    // Default compression for families that have none configured explicitly
    final String defaultCompression = conf.get("hfile.compression",
        Compression.Algorithm.NONE.getName());
    final boolean compactionExclude = conf.getBoolean(
        "hbase.mapreduce.hfileoutputformat.compaction.exclude", false);

    // Per-family compression and bloom filter settings serialized into the job conf
    final Map<byte[], String> compressionMap = createFamilyCompressionMap(conf);
    final Map<byte[], String> bloomTypeMap = createFamilyBloomMap(conf);

    String dataBlockEncodingStr = conf.get(DATABLOCK_ENCODING_CONF_KEY);
    final HFileDataBlockEncoder encoder;
    if (dataBlockEncodingStr == null) {
      encoder = NoOpDataBlockEncoder.INSTANCE;
    } else {
      try {
        encoder = new HFileDataBlockEncoderImpl(
            DataBlockEncoding.valueOf(dataBlockEncodingStr));
      } catch (IllegalArgumentException ex) {
        throw new RuntimeException("Invalid data block encoding type configured for "
            + DATABLOCK_ENCODING_CONF_KEY + ": " + dataBlockEncodingStr);
      }
    }

    return new RecordWriter<ImmutableBytesWritable, KeyValue>() {
      // Map of column family -> open writer and how many bytes it has written
      private final Map<byte[], WriterLength> writers =
          new TreeMap<byte[], WriterLength>(Bytes.BYTES_COMPARATOR);
      private byte[] previousRow = HConstants.EMPTY_BYTE_ARRAY;
      private final byte[] now = Bytes.toBytes(System.currentTimeMillis());
      private boolean rollRequested = false;

      public void write(ImmutableBytesWritable row, KeyValue kv)
          throws IOException {
        // A null key and value is the caller's way of forcing a roll
        if (row == null && kv == null) {
          rollWriters();
          return;
        }

        byte[] rowKey = kv.getRow();
        long length = kv.getLength();
        byte[] family = kv.getFamily();
        WriterLength wl = this.writers.get(family);

        // If this is a new column family, make sure its output directory exists
        if (wl == null) {
          fs.mkdirs(new Path(outputdir, Bytes.toString(family)));
        }

        // If any of the HFiles for the column families has reached maxsize,
        // we need to roll all the writers
        if (wl != null && wl.written + length >= maxsize) {
          this.rollRequested = true;
        }

        // But only roll once the current row is finished
        if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
          rollWriters();
        }

        // Create a new HFile writer for this family if necessary
        if (wl == null || wl.writer == null) {
          wl = getNewWriter(family, conf);
        }

        // Cells carrying LATEST_TIMESTAMP are rewritten to the task start time
        kv.updateLatestStamp(this.now);
        wl.writer.append(kv);
        wl.written += length;

        // Remember the row so we can detect row transitions
        this.previousRow = rowKey;
      }

      /* Close all open writers and reset their byte counts. */
      private void rollWriters() throws IOException {
        for (WriterLength wl : this.writers.values()) {
          if (wl.writer != null) {
            LOG.info("Writer=" + wl.writer.getPath() +
                ((wl.written == 0) ? "" : ", wrote=" + wl.written));
            close(wl.writer);
          }
          wl.writer = null;
          wl.written = 0;
        }
        this.rollRequested = false;
      }

      /*
       * Create a new StoreFile.Writer for the given column family, honouring any
       * per-family compression and bloom filter settings carried in the job conf.
       * @return the WriterLength holding the new writer
       */
      private WriterLength getNewWriter(byte[] family, Configuration conf)
          throws IOException {
        WriterLength wl = new WriterLength();
        Path familydir = new Path(outputdir, Bytes.toString(family));
        String compression = compressionMap.get(family);
        compression = compression == null ? defaultCompression : compression;
        String bloomTypeStr = bloomTypeMap.get(family);
        BloomType bloomType = BloomType.NONE;
        if (bloomTypeStr != null) {
          bloomType = BloomType.valueOf(bloomTypeStr);
        }
        // Disable the block cache on the CacheConfig handed to the writer;
        // there is no point caching blocks written by a MapReduce task.
        Configuration tempConf = new Configuration(conf);
        tempConf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f);
        wl.writer = new StoreFile.WriterBuilder(conf, new CacheConfig(tempConf), fs, blocksize)
            .withOutputDir(familydir)
            .withCompression(AbstractHFileWriter.compressionByName(compression))
            .withBloomType(bloomType)
            .withComparator(KeyValue.COMPARATOR)
            .withDataBlockEncoder(encoder)
            .withChecksumType(Store.getChecksumType(conf))
            .withBytesPerChecksum(Store.getBytesPerChecksum(conf))
            .build();

        this.writers.put(family, wl);
        return wl;
      }

      private void close(final StoreFile.Writer w) throws IOException {
        if (w != null) {
          w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY,
              Bytes.toBytes(System.currentTimeMillis()));
          w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
              Bytes.toBytes(context.getTaskAttemptID().toString()));
          w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY,
              Bytes.toBytes(true));
          w.appendFileInfo(StoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY,
              Bytes.toBytes(compactionExclude));
          w.appendTrackedTimestampsToMetadata();
          w.close();
        }
      }

      public void close(TaskAttemptContext c)
          throws IOException, InterruptedException {
        for (WriterLength wl : this.writers.values()) {
          close(wl.writer);
        }
      }
    };
  }

  /*
   * Data structure holding a StoreFile.Writer and the number of bytes written to it.
   */
  static class WriterLength {
    long written = 0;
    StoreFile.Writer writer = null;
  }

  /**
   * Return the start keys of all of the regions in this table,
   * as a list of ImmutableBytesWritable.
   */
  private static List<ImmutableBytesWritable> getRegionStartKeys(HTable table)
      throws IOException {
    byte[][] byteKeys = table.getStartKeys();
    ArrayList<ImmutableBytesWritable> ret =
        new ArrayList<ImmutableBytesWritable>(byteKeys.length);
    for (byte[] byteKey : byteKeys) {
      ret.add(new ImmutableBytesWritable(byteKey));
    }
    return ret;
  }

  /**
   * Write out a SequenceFile that can be read by TotalOrderPartitioner
   * and that contains the split points in startKeys.
   * @param partitionsPath output path for the SequenceFile
   * @param startKeys the region start keys
   */
  private static void writePartitions(Configuration conf, Path partitionsPath,
      List<ImmutableBytesWritable> startKeys) throws IOException {
    if (startKeys.isEmpty()) {
      throw new IllegalArgumentException("No regions passed");
    }

    // We're generating a list of split points, and we don't ever
    // have keys < the first region (which has an empty start key)
    // so we need to remove it. Otherwise we would end up with an
    // empty reducer with index 0.
    TreeSet<ImmutableBytesWritable> sorted =
        new TreeSet<ImmutableBytesWritable>(startKeys);

    ImmutableBytesWritable first = sorted.first();
    if (!first.equals(HConstants.EMPTY_BYTE_ARRAY)) {
      throw new IllegalArgumentException(
          "First region of table should have empty start key. Instead has: "
          + Bytes.toStringBinary(first.get()));
    }
    sorted.remove(first);

    // Write the actual file
    FileSystem fs = partitionsPath.getFileSystem(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs,
        conf, partitionsPath, ImmutableBytesWritable.class, NullWritable.class);

    try {
      for (ImmutableBytesWritable startKey : sorted) {
        writer.append(startKey, NullWritable.get());
      }
    } finally {
      writer.close();
    }
  }

  /**
   * Configure a MapReduce Job to perform an incremental load into the given
   * table. This
   * <ul>
   *   <li>Inspects the table to configure a total order partitioner</li>
   *   <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
   *   <li>Sets the number of reduce tasks to match the current number of regions</li>
   *   <li>Sets the output key/value class to match HFileOutputFormat's requirements</li>
   *   <li>Sets the reducer up to perform the appropriate sorting (either
   *     KeyValueSortReducer or PutSortReducer)</li>
   * </ul>
   * The user should be sure to set the map output value class to either
   * KeyValue or Put before running this function.
   */
  public static void configureIncrementalLoad(Job job, HTable table)
      throws IOException {
    Configuration conf = job.getConfiguration();
    Class<? extends Partitioner> topClass;
    try {
      topClass = getTotalOrderPartitionerClass();
    } catch (ClassNotFoundException e) {
      throw new IOException("Failed getting TotalOrderPartitioner", e);
    }
    job.setPartitionerClass(topClass);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(HFileOutputFormat.class);

    // Based on the configured map output class, set the correct reducer to
    // properly sort the incoming data.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
      job.setReducerClass(PutSortReducer.class);
    } else {
      LOG.warn("Unknown map output value type: " + job.getMapOutputValueClass());
    }

    LOG.info("Looking up current regions for table " + table);
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions " +
        "to match current region count");
    job.setNumReduceTasks(startKeys.size());

    Path partitionsPath = new Path(job.getWorkingDirectory(),
        "partitions_" + UUID.randomUUID());
    LOG.info("Writing partition information to " + partitionsPath);

    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    // makeQualified returns a new Path rather than mutating this one, so keep
    // the qualified form for the DistributedCache URI built below.
    partitionsPath = partitionsPath.makeQualified(fs);

    URI cacheUri;
    try {
      // Reference the partition file under the symlink name that the bundled
      // TotalOrderPartitioner backport expects to find in the DistributedCache.
      cacheUri = new URI(partitionsPath.toString() + "#" +
          org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
      throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);

    // Serialize the per-family compression and bloom filter settings into the job conf
    configureCompression(table, conf);
    configureBloomType(table, conf);

    TableMapReduceUtil.addDependencyJars(job);
    LOG.info("Incremental table output configured.");
  }

  /**
   * Return the TotalOrderPartitioner class to use. Newer Hadoop versions ship
   * one under org.apache.hadoop.mapreduce.lib.partition; on older versions we
   * fall back to the backport bundled with HBase under hadoopbackport.
   * @return the TotalOrderPartitioner class to set on the job
   * @throws ClassNotFoundException if neither class can be found
   */
  @SuppressWarnings("unchecked")
  private static Class<? extends Partitioner> getTotalOrderPartitionerClass()
      throws ClassNotFoundException {
    Class<? extends Partitioner> clazz = null;
    try {
      clazz = (Class<? extends Partitioner>)
          Class.forName("org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner");
    } catch (ClassNotFoundException e) {
      clazz = (Class<? extends Partitioner>)
          Class.forName("org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner");
    }
    return clazz;
  }

  /**
   * Run inside the task to deserialize the column family to compression
   * algorithm map from the configuration.
   * Package-private for unit tests only.
   * @return a map from column family to the name of the configured
   *   compression algorithm
   */
  static Map<byte[], String> createFamilyCompressionMap(Configuration conf) {
    return createFamilyConfValueMap(conf, COMPRESSION_CONF_KEY);
  }

  private static Map<byte[], String> createFamilyBloomMap(Configuration conf) {
    return createFamilyConfValueMap(conf, BLOOM_TYPE_CONF_KEY);
  }

  /**
   * Run inside the task to deserialize a column family to value map from the
   * configuration.
   * @param conf the job configuration to read from
   * @param confName the configuration key holding the serialized map
   * @return a map of column family to the configured value
   */
  private static Map<byte[], String> createFamilyConfValueMap(Configuration conf, String confName) {
    Map<byte[], String> confValMap = new TreeMap<byte[], String>(Bytes.BYTES_COMPARATOR);
    String confVal = conf.get(confName, "");
    for (String familyConf : confVal.split("&")) {
      String[] familySplit = familyConf.split("=");
      if (familySplit.length != 2) {
        continue;
      }
      try {
        confValMap.put(URLDecoder.decode(familySplit[0], "UTF-8").getBytes(),
            URLDecoder.decode(familySplit[1], "UTF-8"));
      } catch (UnsupportedEncodingException e) {
        // Cannot happen: UTF-8 is always supported
        throw new AssertionError(e);
      }
    }
    return confValMap;
  }
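
  // For reference, the serialized form parsed above (and produced by
  // configureCompression / configureBloomType below) is a list of
  // "family=value" pairs joined with '&', with both sides URL-encoded.
  // The family names and values here are only an illustrative assumption:
  //
  //   hbase.hfileoutputformat.families.compression = "cf1=gz&cf2=none"
  //   hbase.hfileoutputformat.families.bloomtype   = "cf1=ROW&cf2=NONE"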

  /**
   * Serialize the column family to compression algorithm map into the
   * configuration. Invoked while configuring the MR job for incremental load.
   * Package-private for unit tests only.
   * @param table to read the column family properties from
   * @param conf to persist the serialized values into
   * @throws IOException on failure to read column family descriptors
   */
  static void configureCompression(HTable table, Configuration conf) throws IOException {
    StringBuilder compressionConfigValue = new StringBuilder();
    HTableDescriptor tableDescriptor = table.getTableDescriptor();
    if (tableDescriptor == null) {
      // could happen with a mock table instance
      return;
    }
    Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
    int i = 0;
    for (HColumnDescriptor familyDescriptor : families) {
      if (i++ > 0) {
        compressionConfigValue.append('&');
      }
      compressionConfigValue.append(URLEncoder.encode(familyDescriptor.getNameAsString(), "UTF-8"));
      compressionConfigValue.append('=');
      compressionConfigValue.append(URLEncoder.encode(familyDescriptor.getCompression().getName(), "UTF-8"));
    }

    conf.set(COMPRESSION_CONF_KEY, compressionConfigValue.toString());
  }

  /**
   * Serialize the column family to bloom filter type map into the
   * configuration. Invoked while configuring the MR job for incremental load.
   * @param table to read the column family properties from
   * @param conf to persist the serialized values into
   * @throws IOException on failure to read column family descriptors
   */
  static void configureBloomType(HTable table, Configuration conf) throws IOException {
    HTableDescriptor tableDescriptor = table.getTableDescriptor();
    if (tableDescriptor == null) {
      // could happen with a mock table instance
      return;
    }
    StringBuilder bloomTypeConfigValue = new StringBuilder();
    Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
    int i = 0;
    for (HColumnDescriptor familyDescriptor : families) {
      if (i++ > 0) {
        bloomTypeConfigValue.append('&');
      }
      bloomTypeConfigValue.append(URLEncoder.encode(familyDescriptor.getNameAsString(), "UTF-8"));
      bloomTypeConfigValue.append('=');
      String bloomType = familyDescriptor.getBloomFilterType().toString();
      if (bloomType == null) {
        bloomType = HColumnDescriptor.DEFAULT_BLOOMFILTER;
      }
      bloomTypeConfigValue.append(URLEncoder.encode(bloomType, "UTF-8"));
    }
    conf.set(BLOOM_TYPE_CONF_KEY, bloomTypeConfigValue.toString());
  }
}
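
// After the job completes, the HFiles under the output directory are typically
// moved into the live table with the completebulkload tool
// (org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles), for example:
//   hadoop jar hbase-<version>.jar completebulkload <outputdir> <tablename>
// where <outputdir> and <tablename> stand in for the job's output directory
// and the target table.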