1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.mapreduce;
19  
20  import java.io.IOException;
21  import java.io.UnsupportedEncodingException;
22  import java.net.InetSocketAddress;
23  import java.net.URLDecoder;
24  import java.net.URLEncoder;
25  import java.util.ArrayList;
26  import java.util.Collection;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.TreeMap;
30  import java.util.TreeSet;
31  import java.util.UUID;
32  
33  import org.apache.commons.logging.Log;
34  import org.apache.commons.logging.LogFactory;
35  import org.apache.hadoop.hbase.classification.InterfaceAudience;
36  import org.apache.hadoop.hbase.classification.InterfaceStability;
37  import org.apache.hadoop.conf.Configuration;
38  import org.apache.hadoop.fs.FileSystem;
39  import org.apache.hadoop.fs.Path;
40  import org.apache.hadoop.hbase.Cell;
41  import org.apache.hadoop.hbase.CellUtil;
42  import org.apache.hadoop.hbase.HColumnDescriptor;
43  import org.apache.hadoop.hbase.HConstants;
44  import org.apache.hadoop.hbase.HRegionLocation;
45  import org.apache.hadoop.hbase.HTableDescriptor;
46  import org.apache.hadoop.hbase.KeyValue;
47  import org.apache.hadoop.hbase.KeyValueUtil;
48  import org.apache.hadoop.hbase.client.HTable;
49  import org.apache.hadoop.hbase.client.Put;
50  import org.apache.hadoop.hbase.fs.HFileSystem;
51  import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
52  import org.apache.hadoop.hbase.io.compress.Compression;
53  import org.apache.hadoop.hbase.io.compress.Compression.Algorithm;
54  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
55  import org.apache.hadoop.hbase.io.hfile.AbstractHFileWriter;
56  import org.apache.hadoop.hbase.io.hfile.CacheConfig;
57  import org.apache.hadoop.hbase.io.hfile.HFileContext;
58  import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
59  import org.apache.hadoop.hbase.regionserver.BloomType;
60  import org.apache.hadoop.hbase.regionserver.HStore;
61  import org.apache.hadoop.hbase.regionserver.StoreFile;
62  import org.apache.hadoop.hbase.util.Bytes;
63  import org.apache.hadoop.io.NullWritable;
64  import org.apache.hadoop.io.SequenceFile;
65  import org.apache.hadoop.io.Text;
66  import org.apache.hadoop.mapreduce.Job;
67  import org.apache.hadoop.mapreduce.OutputFormat;
68  import org.apache.hadoop.mapreduce.RecordWriter;
69  import org.apache.hadoop.mapreduce.TaskAttemptContext;
70  import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
71  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
72  import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
73  
74  import com.google.common.annotations.VisibleForTesting;
75  
76  /**
77   * Writes HFiles. Passed Cells must arrive in order.
78   * Writes the current time as the sequence id for the file. Sets the major-compacted
79   * attribute on created HFiles. Calling write(null,null) will forcibly roll
80   * all HFiles being written.
81   * <p>
82   * Using this class as part of a MapReduce job is best done
83   * using {@link #configureIncrementalLoad(Job, HTable)}.
84   */
85  @InterfaceAudience.Public
86  @InterfaceStability.Evolving
87  public class HFileOutputFormat2
88      extends FileOutputFormat<ImmutableBytesWritable, Cell> {
89    static Log LOG = LogFactory.getLog(HFileOutputFormat2.class);
90  
91    // The following constants are private since these are used by
92    // HFileOutputFormat2 to internally transfer data between job setup and
93    // reducer run using conf.
94    // These should not be changed by the client.
95    private static final String COMPRESSION_FAMILIES_CONF_KEY =
96        "hbase.hfileoutputformat.families.compression";
97    private static final String BLOOM_TYPE_FAMILIES_CONF_KEY =
98        "hbase.hfileoutputformat.families.bloomtype";
99    private static final String BLOCK_SIZE_FAMILIES_CONF_KEY =
100       "hbase.mapreduce.hfileoutputformat.blocksize";
101   private static final String DATABLOCK_ENCODING_FAMILIES_CONF_KEY =
102       "hbase.mapreduce.hfileoutputformat.families.datablock.encoding";
103 
104   // This constant is public since clients may set it in their conf object
105   // and therefore need to refer to this symbol.
106   // It is present for backwards compatibility reasons. Use it only to
107   // override the auto-detection of datablock encoding.
108   public static final String DATABLOCK_ENCODING_OVERRIDE_CONF_KEY =
109       "hbase.mapreduce.hfileoutputformat.datablock.encoding";
110 
111   /**
112    * Keep locality while generating HFiles for bulkload. See HBASE-12596
113    */
114   public static final String LOCALITY_SENSITIVE_CONF_KEY =
115       "hbase.bulkload.locality.sensitive.enabled";
116   private static final boolean DEFAULT_LOCALITY_SENSITIVE = true;
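  // Locality-sensitive writing is on by default; a client can opt out, e.g. (illustrative):
  //   conf.setBoolean(HFileOutputFormat2.LOCALITY_SENSITIVE_CONF_KEY, false);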
117   private static final String OUTPUT_TABLE_NAME_CONF_KEY =
118       "hbase.mapreduce.hfileoutputformat.table.name";
119 
120   @Override
121   public RecordWriter<ImmutableBytesWritable, Cell> getRecordWriter(
122       final TaskAttemptContext context) throws IOException, InterruptedException {
123     return createRecordWriter(context);
124   }
125 
126   static <V extends Cell> RecordWriter<ImmutableBytesWritable, V>
127       createRecordWriter(final TaskAttemptContext context)
128           throws IOException, InterruptedException {
129 
130     // Get the path of the temporary output file
131     final Path outputPath = FileOutputFormat.getOutputPath(context);
132     final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();
133     final Configuration conf = context.getConfiguration();
134     final FileSystem fs = outputdir.getFileSystem(conf);
135     // These configs are from hbase-*.xml
136     final long maxsize = conf.getLong(HConstants.HREGION_MAX_FILESIZE,
137         HConstants.DEFAULT_MAX_FILE_SIZE);
138     // Invented config.  Add to hbase-*.xml if other than default compression.
139     final String defaultCompressionStr = conf.get("hfile.compression",
140         Compression.Algorithm.NONE.getName());
141     final Algorithm defaultCompression = AbstractHFileWriter
142         .compressionByName(defaultCompressionStr);
143     final boolean compactionExclude = conf.getBoolean(
144         "hbase.mapreduce.hfileoutputformat.compaction.exclude", false);
145 
146     // create a map from column family to the compression algorithm
147     final Map<byte[], Algorithm> compressionMap = createFamilyCompressionMap(conf);
148     final Map<byte[], BloomType> bloomTypeMap = createFamilyBloomTypeMap(conf);
149     final Map<byte[], Integer> blockSizeMap = createFamilyBlockSizeMap(conf);
150 
151     String dataBlockEncodingStr = conf.get(DATABLOCK_ENCODING_OVERRIDE_CONF_KEY);
152     final Map<byte[], DataBlockEncoding> datablockEncodingMap
153         = createFamilyDataBlockEncodingMap(conf);
154     final DataBlockEncoding overriddenEncoding;
155     if (dataBlockEncodingStr != null) {
156       overriddenEncoding = DataBlockEncoding.valueOf(dataBlockEncodingStr);
157     } else {
158       overriddenEncoding = null;
159     }
160 
161     return new RecordWriter<ImmutableBytesWritable, V>() {
162       // Map of families to writers and how much has been output on the writer.
163       private final Map<byte [], WriterLength> writers =
164         new TreeMap<byte [], WriterLength>(Bytes.BYTES_COMPARATOR);
165       private byte [] previousRow = HConstants.EMPTY_BYTE_ARRAY;
166       private final byte [] now = Bytes.toBytes(System.currentTimeMillis());
167       private boolean rollRequested = false;
168 
169       @Override
170       public void write(ImmutableBytesWritable row, V cell)
171           throws IOException {
172         KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
173 
174         // null input == user explicitly wants to flush
175         if (row == null && kv == null) {
176           rollWriters();
177           return;
178         }
179 
180         byte [] rowKey = CellUtil.cloneRow(kv);
181         long length = kv.getLength();
182         byte [] family = CellUtil.cloneFamily(kv);
183         WriterLength wl = this.writers.get(family);
184 
185         // If this is a new column family, create its output directory
186         if (wl == null) {
187           fs.mkdirs(new Path(outputdir, Bytes.toString(family)));
188         }
189 
190         // If any of the HFiles for the column families has reached
191         // maxsize, we need to roll all the writers
192         if (wl != null && wl.written + length >= maxsize) {
193           this.rollRequested = true;
194         }
195 
196         // Rolling only happens once the current row is finished, so a row never spans HFiles
197         if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
198           rollWriters();
199         }
200 
201         // create a new StoreFile writer, if necessary
202         if (wl == null || wl.writer == null) {
203           if (conf.getBoolean(LOCALITY_SENSITIVE_CONF_KEY, DEFAULT_LOCALITY_SENSITIVE)) {
204             String tableName = conf.get(OUTPUT_TABLE_NAME_CONF_KEY);
205             HRegionLocation loc = null;
206             HTable htable = null;
207             try {
208               htable = new HTable(conf, tableName);
209               loc = htable.getRegionLocation(rowKey);
210             } catch (Throwable e) {
211               LOG.warn("there's something wrong when locating rowkey: " +
212                 Bytes.toString(rowKey), e);
213               loc = null;
214             } finally {
215               if(null != htable) {
216                 htable.close();
217               }
218             }
219 
220             if (null == loc) {
221               if (LOG.isTraceEnabled()) {
222                 LOG.trace("failed to get region location, so use default writer: " +
223                   Bytes.toString(rowKey));
224               }
225               wl = getNewWriter(family, conf, null);
226             } else {
227               if (LOG.isDebugEnabled()) {
228                 LOG.debug("first rowkey: [" + Bytes.toString(rowKey) + "]");
229               }
230               InetSocketAddress initialIsa =
231                   new InetSocketAddress(loc.getHostname(), loc.getPort());
232               if (initialIsa.isUnresolved()) {
233                 if (LOG.isTraceEnabled()) {
234                   LOG.trace("failed to resolve bind address: " + loc.getHostname() + ":"
235                       + loc.getPort() + ", so use default writer");
236                 }
237                 wl = getNewWriter(family, conf, null);
238               } else {
239                 if(LOG.isDebugEnabled()) {
240                   LOG.debug("use favored nodes writer: " + initialIsa.getHostName());
241                 }
242                 wl = getNewWriter(family, conf, new InetSocketAddress[] { initialIsa });
243               }
244             }
245           } else {
246             wl = getNewWriter(family, conf, null);
247           }
248         }
249 
250         // we now have the proper StoreFile writer. full steam ahead
251         kv.updateLatestStamp(this.now);
252         wl.writer.append(kv);
253         wl.written += length;
254 
255         // Copy the row so we can detect a row transition.
256         this.previousRow = rowKey;
257       }
258 
259       private void rollWriters() throws IOException {
260         for (WriterLength wl : this.writers.values()) {
261           if (wl.writer != null) {
262             LOG.info("Writer=" + wl.writer.getPath() +
263                 ((wl.written == 0)? "": ", wrote=" + wl.written));
264             close(wl.writer);
265           }
266           wl.writer = null;
267           wl.written = 0;
268         }
269         this.rollRequested = false;
270       }
271 
272       /* Create a new StoreFile.Writer.
273        * @param family column family for which the writer is created
274        * @return A WriterLength, containing a new StoreFile.Writer.
275        * @throws IOException
276        */
277       private WriterLength getNewWriter(byte[] family, Configuration conf,
278           InetSocketAddress[] favoredNodes) throws IOException {
279         WriterLength wl = new WriterLength();
280         Path familydir = new Path(outputdir, Bytes.toString(family));
281         Algorithm compression = compressionMap.get(family);
282         compression = compression == null ? defaultCompression : compression;
283         BloomType bloomType = bloomTypeMap.get(family);
284         bloomType = bloomType == null ? BloomType.NONE : bloomType;
285         Integer blockSize = blockSizeMap.get(family);
286         blockSize = blockSize == null ? HConstants.DEFAULT_BLOCKSIZE : blockSize;
287         DataBlockEncoding encoding = overriddenEncoding;
288         encoding = encoding == null ? datablockEncodingMap.get(family) : encoding;
289         encoding = encoding == null ? DataBlockEncoding.NONE : encoding;
290         Configuration tempConf = new Configuration(conf);
291         tempConf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f);
292         HFileContextBuilder contextBuilder = new HFileContextBuilder()
293                                     .withCompression(compression)
294                                     .withChecksumType(HStore.getChecksumType(conf))
295                                     .withBytesPerCheckSum(HStore.getBytesPerChecksum(conf))
296                                     .withBlockSize(blockSize);
297         contextBuilder.withDataBlockEncoding(encoding);
298         HFileContext hFileContext = contextBuilder.build();
299                                     
300         if (null == favoredNodes) {
301           wl.writer =
302               new StoreFile.WriterBuilder(conf, new CacheConfig(tempConf), fs)
303                   .withOutputDir(familydir).withBloomType(bloomType)
304                   .withComparator(KeyValue.COMPARATOR).withFileContext(hFileContext).build();
305         } else {
306           wl.writer =
307               new StoreFile.WriterBuilder(conf, new CacheConfig(tempConf), new HFileSystem(fs))
308                   .withOutputDir(familydir).withBloomType(bloomType)
309                   .withComparator(KeyValue.COMPARATOR).withFileContext(hFileContext)
310                   .withFavoredNodes(favoredNodes).build();
311         }
312 
313         this.writers.put(family, wl);
314         return wl;
315       }
316 
317       private void close(final StoreFile.Writer w) throws IOException {
318         if (w != null) {
319           w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY,
320               Bytes.toBytes(System.currentTimeMillis()));
321           w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
322               Bytes.toBytes(context.getTaskAttemptID().toString()));
323           w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY,
324               Bytes.toBytes(true));
325           w.appendFileInfo(StoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY,
326               Bytes.toBytes(compactionExclude));
327           w.appendTrackedTimestampsToMetadata();
328           w.close();
329         }
330       }
331 
332       @Override
333       public void close(TaskAttemptContext c)
334       throws IOException, InterruptedException {
335         for (WriterLength wl: this.writers.values()) {
336           close(wl.writer);
337         }
338       }
339     };
340   }
341 
342   /*
343    * Data structure to hold a Writer and amount of data written on it.
344    */
345   static class WriterLength {
346     long written = 0;
347     StoreFile.Writer writer = null;
348   }
349 
350   /**
351    * Return the start keys of all of the regions in this table,
352    * as a list of ImmutableBytesWritable.
353    */
354   private static List<ImmutableBytesWritable> getRegionStartKeys(HTable table)
355   throws IOException {
356     byte[][] byteKeys = table.getStartKeys();
357     ArrayList<ImmutableBytesWritable> ret =
358       new ArrayList<ImmutableBytesWritable>(byteKeys.length);
359     for (byte[] byteKey : byteKeys) {
360       ret.add(new ImmutableBytesWritable(byteKey));
361     }
362     return ret;
363   }
364 
365   /**
366    * Write out a {@link SequenceFile} that can be read by
367    * {@link TotalOrderPartitioner} that contains the split points in startKeys.
368    */
369   @SuppressWarnings("deprecation")
370   private static void writePartitions(Configuration conf, Path partitionsPath,
371       List<ImmutableBytesWritable> startKeys) throws IOException {
372     LOG.info("Writing partition information to " + partitionsPath);
373     if (startKeys.isEmpty()) {
374       throw new IllegalArgumentException("No regions passed");
375     }
376 
377     // We're generating a list of split points, and we don't ever
378     // have keys < the first region (which has an empty start key)
379     // so we need to remove it. Otherwise we would end up with an
380     // empty reducer with index 0
381     TreeSet<ImmutableBytesWritable> sorted =
382       new TreeSet<ImmutableBytesWritable>(startKeys);
383 
384     ImmutableBytesWritable first = sorted.first();
385     if (!first.equals(HConstants.EMPTY_BYTE_ARRAY)) {
386       throw new IllegalArgumentException(
387           "First region of table should have empty start key. Instead has: "
388           + Bytes.toStringBinary(first.get()));
389     }
390     sorted.remove(first);
391 
392     // Write the actual file
393     FileSystem fs = partitionsPath.getFileSystem(conf);
394     SequenceFile.Writer writer = SequenceFile.createWriter(
395       fs, conf, partitionsPath, ImmutableBytesWritable.class,
396       NullWritable.class);
397 
398     try {
399       for (ImmutableBytesWritable startKey : sorted) {
400         writer.append(startKey, NullWritable.get());
401       }
402     } finally {
403       writer.close();
404     }
405   }
406 
407   /**
408    * Configure a MapReduce Job to perform an incremental load into the given
409    * table. This
410    * <ul>
411    *   <li>Inspects the table to configure a total order partitioner</li>
412    *   <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
413    *   <li>Sets the number of reduce tasks to match the current number of regions</li>
414    *   <li>Sets the output key/value class to match HFileOutputFormat2's requirements</li>
415    *   <li>Sets the reducer up to perform the appropriate sorting (KeyValueSortReducer,
416    *     PutSortReducer, or TextSortReducer)</li>
417    * </ul>
418    * The user should be sure to set the map output value class to KeyValue, Put, or Text before
419    * running this function. A minimal usage sketch appears below this method.
420    */
421   public static void configureIncrementalLoad(Job job, HTable table)
422       throws IOException {
423     configureIncrementalLoad(job, table, HFileOutputFormat2.class);
424   }
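
  // A minimal usage sketch (illustrative only; MyDriver and MyPutMapper are hypothetical
  // client classes, and the output path is an arbitrary example):
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   Job job = Job.getInstance(conf, "prepare-bulk-load");
  //   job.setJarByClass(MyDriver.class);
  //   job.setMapperClass(MyPutMapper.class);                 // emits (ImmutableBytesWritable, Put)
  //   job.setMapOutputKeyClass(ImmutableBytesWritable.class);
  //   job.setMapOutputValueClass(Put.class);                 // selects PutSortReducer
  //   FileOutputFormat.setOutputPath(job, new Path("/tmp/hfiles"));
  //   HTable table = new HTable(conf, "my_table");
  //   HFileOutputFormat2.configureIncrementalLoad(job, table);
  //   job.waitForCompletion(true);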
425 
426   static void configureIncrementalLoad(Job job, HTable table,
427       Class<? extends OutputFormat<?, ?>> cls) throws IOException {
428     Configuration conf = job.getConfiguration();
429 
430     job.setOutputKeyClass(ImmutableBytesWritable.class);
431     job.setOutputValueClass(KeyValue.class);
432     job.setOutputFormatClass(cls);
433 
434     // Based on the configured map output class, set the correct reducer to properly
435     // sort the incoming values.
436     // TODO it would be nice to pick one or the other of these formats.
437     if (KeyValue.class.equals(job.getMapOutputValueClass())) {
438       job.setReducerClass(KeyValueSortReducer.class);
439     } else if (Put.class.equals(job.getMapOutputValueClass())) {
440       job.setReducerClass(PutSortReducer.class);
441     } else if (Text.class.equals(job.getMapOutputValueClass())) {
442       job.setReducerClass(TextSortReducer.class);
443     } else {
444       LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
445     }
446 
447     conf.setStrings("io.serializations", conf.get("io.serializations"),
448         MutationSerialization.class.getName(), ResultSerialization.class.getName(),
449         KeyValueSerialization.class.getName());
450 
451     if (conf.getBoolean(LOCALITY_SENSITIVE_CONF_KEY, DEFAULT_LOCALITY_SENSITIVE)) {
452       // record this table name so writers can be created with favored nodes
453       LOG.info("bulkload locality sensitive enabled");
454       conf.set(OUTPUT_TABLE_NAME_CONF_KEY, table.getName().getNameAsString());
455     }
456 
457     // Use table's region boundaries for TotalOrderPartitioner split points.
458     LOG.info("Looking up current regions for table " + Bytes.toString(table.getTableName()));
459     List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
460     LOG.info("Configuring " + startKeys.size() + " reduce partitions " +
461         "to match current region count");
462     job.setNumReduceTasks(startKeys.size());
463 
464     configurePartitioner(job, startKeys);
465     // Set compression algorithms based on column families
466     configureCompression(table, conf);
467     configureBloomType(table, conf);
468     configureBlockSize(table, conf);
469     configureDataBlockEncoding(table, conf);
470 
471     TableMapReduceUtil.addDependencyJars(job);
472     TableMapReduceUtil.initCredentials(job);
473     LOG.info("Incremental table " + Bytes.toString(table.getTableName())
474       + " output configured.");
475   }
476   
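  /**
   * Configures a job to write HFiles for the given table without setting up a
   * TotalOrderPartitioner, reducer class, or reduce-task count (contrast with
   * {@link #configureIncrementalLoad(Job, HTable)}). Sets the output key/value and output
   * format classes and serializes the table's per-family compression, bloom type, block size,
   * and data block encoding settings into the job configuration.
   */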
477   public static void configureIncrementalLoadMap(Job job, HTable table) throws IOException {
478     Configuration conf = job.getConfiguration();
479 
480     job.setOutputKeyClass(ImmutableBytesWritable.class);
481     job.setOutputValueClass(KeyValue.class);
482     job.setOutputFormatClass(HFileOutputFormat2.class);
483 
484     // Set compression algorithms based on column families
485     configureCompression(table, conf);
486     configureBloomType(table, conf);
487     configureBlockSize(table, conf);
488     configureDataBlockEncoding(table, conf);
489 
490     TableMapReduceUtil.addDependencyJars(job);
491     TableMapReduceUtil.initCredentials(job);
492     LOG.info("Incremental table " + table.getName() + " output configured.");
493   }
494 
495   /**
496    * Runs inside the task to deserialize column family to compression algorithm
497    * map from the configuration.
498    *
499    * @param conf to read the serialized values from
500    * @return a map from column family to the configured compression algorithm
501    */
502   @VisibleForTesting
503   static Map<byte[], Algorithm> createFamilyCompressionMap(Configuration
504       conf) {
505     Map<byte[], String> stringMap = createFamilyConfValueMap(conf,
506         COMPRESSION_FAMILIES_CONF_KEY);
507     Map<byte[], Algorithm> compressionMap = new TreeMap<byte[],
508         Algorithm>(Bytes.BYTES_COMPARATOR);
509     for (Map.Entry<byte[], String> e : stringMap.entrySet()) {
510       Algorithm algorithm = AbstractHFileWriter.compressionByName
511           (e.getValue());
512       compressionMap.put(e.getKey(), algorithm);
513     }
514     return compressionMap;
515   }
516 
517   /**
518    * Runs inside the task to deserialize column family to bloom filter type
519    * map from the configuration.
520    *
521    * @param conf to read the serialized values from
522    * @return a map from column family to the configured bloom filter type
523    */
524   @VisibleForTesting
525   static Map<byte[], BloomType> createFamilyBloomTypeMap(Configuration conf) {
526     Map<byte[], String> stringMap = createFamilyConfValueMap(conf,
527         BLOOM_TYPE_FAMILIES_CONF_KEY);
528     Map<byte[], BloomType> bloomTypeMap = new TreeMap<byte[],
529         BloomType>(Bytes.BYTES_COMPARATOR);
530     for (Map.Entry<byte[], String> e : stringMap.entrySet()) {
531       BloomType bloomType = BloomType.valueOf(e.getValue());
532       bloomTypeMap.put(e.getKey(), bloomType);
533     }
534     return bloomTypeMap;
535   }
536 
537   /**
538    * Runs inside the task to deserialize column family to block size
539    * map from the configuration.
540    *
541    * @param conf to read the serialized values from
542    * @return a map from column family to the configured block size
543    */
544   @VisibleForTesting
545   static Map<byte[], Integer> createFamilyBlockSizeMap(Configuration conf) {
546     Map<byte[], String> stringMap = createFamilyConfValueMap(conf,
547         BLOCK_SIZE_FAMILIES_CONF_KEY);
548     Map<byte[], Integer> blockSizeMap = new TreeMap<byte[],
549         Integer>(Bytes.BYTES_COMPARATOR);
550     for (Map.Entry<byte[], String> e : stringMap.entrySet()) {
551       Integer blockSize = Integer.parseInt(e.getValue());
552       blockSizeMap.put(e.getKey(), blockSize);
553     }
554     return blockSizeMap;
555   }
556 
557   /**
558    * Runs inside the task to deserialize column family to data block encoding
559    * type map from the configuration.
560    *
561    * @param conf to read the serialized values from
562    * @return a map from column family to HFileDataBlockEncoder for the
563    *         configured data block type for the family
564    */
565   @VisibleForTesting
566   static Map<byte[], DataBlockEncoding> createFamilyDataBlockEncodingMap(
567       Configuration conf) {
568     Map<byte[], String> stringMap = createFamilyConfValueMap(conf,
569         DATABLOCK_ENCODING_FAMILIES_CONF_KEY);
570     Map<byte[], DataBlockEncoding> encoderMap = new TreeMap<byte[],
571         DataBlockEncoding>(Bytes.BYTES_COMPARATOR);
572     for (Map.Entry<byte[], String> e : stringMap.entrySet()) {
573       encoderMap.put(e.getKey(), DataBlockEncoding.valueOf((e.getValue())));
574     }
575     return encoderMap;
576   }
577 
578 
579   /**
580    * Runs inside the task to deserialize a column family to configuration value map.
581    *
582    * @param conf to read the serialized values from
583    * @param confName conf key to read from the configuration
584    * @return a map of column family to the given configuration value
585    */
586   private static Map<byte[], String> createFamilyConfValueMap(
587       Configuration conf, String confName) {
588     Map<byte[], String> confValMap = new TreeMap<byte[], String>(Bytes.BYTES_COMPARATOR);
589     String confVal = conf.get(confName, "");
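    // The value is a '&'-joined list of URL-encoded "family=value" pairs, as written by the
    // configure* methods below, e.g. (illustrative family names): "cf1=gz&cf2=none".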
590     for (String familyConf : confVal.split("&")) {
591       String[] familySplit = familyConf.split("=");
592       if (familySplit.length != 2) {
593         continue;
594       }
595       try {
596         confValMap.put(URLDecoder.decode(familySplit[0], "UTF-8").getBytes(),
597             URLDecoder.decode(familySplit[1], "UTF-8"));
598       } catch (UnsupportedEncodingException e) {
599         // will not happen with UTF-8 encoding
600         throw new AssertionError(e);
601       }
602     }
603     return confValMap;
604   }
605 
606   /**
607    * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
608    * <code>splitPoints</code>. Cleans up the partitions file after the job exits.
609    */
610   static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints)
611       throws IOException {
612     Configuration conf = job.getConfiguration();
613     // create the partitions file
614     FileSystem fs = FileSystem.get(conf);
615     String hbaseTmpFsDir =
616         conf.get(HConstants.TEMPORARY_FS_DIRECTORY_KEY,
617           HConstants.DEFAULT_TEMPORARY_HDFS_DIRECTORY);
618     Path partitionsPath = new Path(hbaseTmpFsDir, "partitions_" + UUID.randomUUID());
619     partitionsPath = fs.makeQualified(partitionsPath);
620     writePartitions(conf, partitionsPath, splitPoints);
621     fs.deleteOnExit(partitionsPath);
622 
623     // configure job to use it
624     job.setPartitionerClass(TotalOrderPartitioner.class);
625     TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
626   }
627 
628   /**
629    * Serialize column family to compression algorithm map to configuration.
630    * Invoked while configuring the MR job for incremental load.
631    *
632    * @param table to read the properties from
633    * @param conf to persist serialized values into
634    * @throws IOException
635    *           on failure to read column family descriptors
636    */
637   @edu.umd.cs.findbugs.annotations.SuppressWarnings(
638       value="RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE")
639   @VisibleForTesting
640   static void configureCompression(
641       HTable table, Configuration conf) throws IOException {
642     StringBuilder compressionConfigValue = new StringBuilder();
643     HTableDescriptor tableDescriptor = table.getTableDescriptor();
644     if(tableDescriptor == null){
645       // could happen with mock table instance
646       return;
647     }
648     Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
649     int i = 0;
650     for (HColumnDescriptor familyDescriptor : families) {
651       if (i++ > 0) {
652         compressionConfigValue.append('&');
653       }
654       compressionConfigValue.append(URLEncoder.encode(
655         familyDescriptor.getNameAsString(), "UTF-8"));
656       compressionConfigValue.append('=');
657       compressionConfigValue.append(URLEncoder.encode(
658         familyDescriptor.getCompression().getName(), "UTF-8"));
659     }
660     // Persist the serialized family=compression pairs into the conf
661     conf.set(COMPRESSION_FAMILIES_CONF_KEY, compressionConfigValue.toString());
662   }
663 
664   /**
665    * Serialize column family to block size map to configuration.
666    * Invoked while configuring the MR job for incremental load.
667    *
668    * @param table to read the properties from
669    * @param conf to persist serialized values into
670    * @throws IOException
671    *           on failure to read column family descriptors
672    */
673   @VisibleForTesting
674   static void configureBlockSize(
675       HTable table, Configuration conf) throws IOException {
676     StringBuilder blockSizeConfigValue = new StringBuilder();
677     HTableDescriptor tableDescriptor = table.getTableDescriptor();
678     if (tableDescriptor == null) {
679       // could happen with mock table instance
680       return;
681     }
682     Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
683     int i = 0;
684     for (HColumnDescriptor familyDescriptor : families) {
685       if (i++ > 0) {
686         blockSizeConfigValue.append('&');
687       }
688       blockSizeConfigValue.append(URLEncoder.encode(
689           familyDescriptor.getNameAsString(), "UTF-8"));
690       blockSizeConfigValue.append('=');
691       blockSizeConfigValue.append(URLEncoder.encode(
692           String.valueOf(familyDescriptor.getBlocksize()), "UTF-8"));
693     }
694     // Persist the serialized family=blocksize pairs into the conf
695     conf.set(BLOCK_SIZE_FAMILIES_CONF_KEY, blockSizeConfigValue.toString());
696   }
697 
698   /**
699    * Serialize column family to bloom type map to configuration.
700    * Invoked while configuring the MR job for incremental load.
701    *
702    * @param table to read the properties from
703    * @param conf to persist serialized values into
704    * @throws IOException
705    *           on failure to read column family descriptors
706    */
707   @VisibleForTesting
708   static void configureBloomType(
709       HTable table, Configuration conf) throws IOException {
710     HTableDescriptor tableDescriptor = table.getTableDescriptor();
711     if (tableDescriptor == null) {
712       // could happen with mock table instance
713       return;
714     }
715     StringBuilder bloomTypeConfigValue = new StringBuilder();
716     Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
717     int i = 0;
718     for (HColumnDescriptor familyDescriptor : families) {
719       if (i++ > 0) {
720         bloomTypeConfigValue.append('&');
721       }
722       bloomTypeConfigValue.append(URLEncoder.encode(
723         familyDescriptor.getNameAsString(), "UTF-8"));
724       bloomTypeConfigValue.append('=');
725       String bloomType = familyDescriptor.getBloomFilterType().toString();
726       if (bloomType == null) {
727         bloomType = HColumnDescriptor.DEFAULT_BLOOMFILTER;
728       }
729       bloomTypeConfigValue.append(URLEncoder.encode(bloomType, "UTF-8"));
730     }
731     conf.set(BLOOM_TYPE_FAMILIES_CONF_KEY, bloomTypeConfigValue.toString());
732   }
733 
734   /**
735    * Serialize column family to data block encoding map to configuration.
736    * Invoked while configuring the MR job for incremental load.
737    *
738    * @param table to read the properties from
739    * @param conf to persist serialized values into
740    * @throws IOException
741    *           on failure to read column family descriptors
742    */
743   @VisibleForTesting
744   static void configureDataBlockEncoding(HTable table,
745       Configuration conf) throws IOException {
746     HTableDescriptor tableDescriptor = table.getTableDescriptor();
747     if (tableDescriptor == null) {
748       // could happen with mock table instance
749       return;
750     }
751     StringBuilder dataBlockEncodingConfigValue = new StringBuilder();
752     Collection<HColumnDescriptor> families = tableDescriptor.getFamilies();
753     int i = 0;
754     for (HColumnDescriptor familyDescriptor : families) {
755       if (i++ > 0) {
756         dataBlockEncodingConfigValue.append('&');
757       }
758       dataBlockEncodingConfigValue.append(
759           URLEncoder.encode(familyDescriptor.getNameAsString(), "UTF-8"));
760       dataBlockEncodingConfigValue.append('=');
761       DataBlockEncoding encoding = familyDescriptor.getDataBlockEncoding();
762       if (encoding == null) {
763         encoding = DataBlockEncoding.NONE;
764       }
765       dataBlockEncodingConfigValue.append(URLEncoder.encode(encoding.toString(),
766           "UTF-8"));
767     }
768     conf.set(DATABLOCK_ENCODING_FAMILIES_CONF_KEY,
769         dataBlockEncodingConfigValue.toString());
770   }
771 }