1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.io.hfile;
20  
21  import java.io.ByteArrayInputStream;
22  import java.io.Closeable;
23  import java.io.DataInput;
24  import java.io.DataInputStream;
25  import java.io.DataOutputStream;
26  import java.io.IOException;
27  import java.io.SequenceInputStream;
28  import java.net.InetSocketAddress;
29  import java.nio.ByteBuffer;
30  import java.util.ArrayList;
31  import java.util.Collection;
32  import java.util.Comparator;
33  import java.util.List;
34  import java.util.Map;
35  import java.util.Set;
36  import java.util.SortedMap;
37  import java.util.TreeMap;
38  import java.util.concurrent.ArrayBlockingQueue;
39  import java.util.concurrent.BlockingQueue;
40  import java.util.concurrent.atomic.AtomicInteger;
41  import java.util.concurrent.atomic.AtomicLong;
42  
43  import com.google.protobuf.HBaseZeroCopyByteString;
44  import org.apache.commons.logging.Log;
45  import org.apache.commons.logging.LogFactory;
46  import org.apache.hadoop.classification.InterfaceAudience;
47  import org.apache.hadoop.conf.Configuration;
48  import org.apache.hadoop.fs.FSDataInputStream;
49  import org.apache.hadoop.fs.FSDataOutputStream;
50  import org.apache.hadoop.fs.FileStatus;
51  import org.apache.hadoop.fs.FileSystem;
52  import org.apache.hadoop.fs.Path;
53  import org.apache.hadoop.fs.PathFilter;
54  import org.apache.hadoop.hbase.HColumnDescriptor;
55  import org.apache.hadoop.hbase.HConstants;
56  import org.apache.hadoop.hbase.KeyValue;
57  import org.apache.hadoop.hbase.KeyValue.KVComparator;
58  import org.apache.hadoop.hbase.fs.HFileSystem;
59  import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper;
60  import org.apache.hadoop.hbase.io.compress.Compression;
61  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
62  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
63  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos;
64  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.BytesBytesPair;
65  import org.apache.hadoop.hbase.protobuf.generated.HFileProtos;
66  import org.apache.hadoop.hbase.util.BloomFilterWriter;
67  import org.apache.hadoop.hbase.util.Bytes;
68  import org.apache.hadoop.hbase.util.ChecksumType;
69  import org.apache.hadoop.hbase.util.FSUtils;
70  import org.apache.hadoop.io.Writable;
71  
72  import com.google.common.base.Preconditions;
73  import com.google.common.collect.Lists;
74  
75  /**
76   * File format for HBase.
77   * A file of sorted key/value pairs. Both keys and values are byte arrays.
78   * <p>
79   * The memory footprint of an HFile includes the following (below is taken from the
80   * <a
81   * href="https://issues.apache.org/jira/browse/HADOOP-3315">TFile</a> documentation
82   * but also applies to HFile):
83   * <ul>
84   * <li>Some constant overhead of reading or writing a compressed block.
85   * <ul>
86   * <li>Each compressed block requires one compression/decompression codec for
87   * I/O.
88   * <li>Temporary space to buffer the key.
89   * <li>Temporary space to buffer the value.
90   * </ul>
91   * <li>HFile index, which is proportional to the total number of Data Blocks.
92   * The total amount of memory needed to hold the index can be estimated as
93   * (56+AvgKeySize)*NumBlocks.
94   * </ul>
95   * Suggestions on performance optimization:
96   * <ul>
97   * <li>Minimum block size. We recommend a minimum block size between
98   * 8KB and 1MB for general usage. A larger block size is preferred if files are
99   * primarily for sequential access. However, it leads to inefficient random
100  * access (because there is more data to decompress). Smaller blocks are good
101  * for random access, but require more memory to hold the block index, and may
102  * be slower to create (because we must flush the compressor stream at the
103  * conclusion of each data block, which leads to an FS I/O flush). Further, due
104  * to the internal caching in the compression codec, the smallest practical block
105  * size is around 20KB-30KB.
106  * <li>The current implementation does not offer true multi-threading for
107  * reading. The implementation uses FSDataInputStream seek()+read(), which has been
108  * shown to be much faster than the positioned-read call in single-threaded mode.
109  * However, it also means that if multiple threads attempt to access the same
110  * HFile (using multiple scanners) simultaneously, the actual I/O is carried out
111  * sequentially even if they access different DFS blocks (Reexamine! pread seems
112  * to be 10% faster than seek+read in my testing -- stack).
113  * <li>Compression codec. Use "none" if the data is not very compressible (by
114  * compressible, I mean a compression ratio of at least 2:1). Generally, use "lzo"
115  * as the starting point for experimenting. "gz" offers a slightly better
116  * compression ratio than "lzo", but requires 4x the CPU to compress and 2x the CPU to
117  * decompress, compared to "lzo".
118  * </ul>
119  *
120  * For more on the background behind HFile, see <a
121  * href="https://issues.apache.org/jira/browse/HBASE-61">HBASE-61</a>.
122  * <p>
123  * The file is made of data blocks followed by meta data blocks (if any), a fileinfo
124  * block, a data block index, a meta data block index, and a fixed-size trailer
125  * which records the offsets at which the file changes content type.
126  * <pre>&lt;data blocks>&lt;meta blocks>&lt;fileinfo>&lt;data index>&lt;meta index>&lt;trailer></pre>
127  * Each block has a bit of magic at its start.  Blocks are made up of
128  * key/values.  In data blocks, they are both byte arrays.  Metadata blocks have
129  * a String key and a byte array value.  An empty file looks like this:
130  * <pre>&lt;fileinfo>&lt;trailer></pre>.  That is, there are neither data nor meta
131  * blocks present.
132  * <p>
133  * TODO: Do scanners need to be able to take a start and end row?
134  * TODO: Should BlockIndex know the name of its file?  Should it have a Path
135  * that points at its file say for the case where an index lives apart from
136  * an HFile instance?
137  */
138 @InterfaceAudience.Private
139 public class HFile {
140   static final Log LOG = LogFactory.getLog(HFile.class);
141 
142   /**
143    * Maximum length of a key in an HFile.
144    */
145   public final static int MAXIMUM_KEY_LENGTH = Integer.MAX_VALUE;
146 
147   /**
148    * Default compression: none.
149    */
150   public final static Compression.Algorithm DEFAULT_COMPRESSION_ALGORITHM =
151     Compression.Algorithm.NONE;
152 
153   /** Minimum supported HFile format version */
154   public static final int MIN_FORMAT_VERSION = 2;
155 
156   /** Maximum supported HFile format version
157    */
158   public static final int MAX_FORMAT_VERSION = 2;
159 
160   /** Default compression name: none. */
161   public final static String DEFAULT_COMPRESSION =
162     DEFAULT_COMPRESSION_ALGORITHM.getName();
163 
164   /** Meta data block name for bloom filter bits. */
165   public static final String BLOOM_FILTER_DATA_KEY = "BLOOM_FILTER_DATA";
166 
167   /**
168    * We assume that an HFile path ends with
169    * ROOT_DIR/TABLE_NAME/REGION_NAME/CF_NAME/HFILE, so it has at least this
170    * many levels of nesting. This is needed for identifying the table and CF name
171    * from an HFile path.
172    */
173   public final static int MIN_NUM_HFILE_PATH_LEVELS = 5;
174 
175   /**
176    * The number of bytes per checksum.
177    */
178   public static final int DEFAULT_BYTES_PER_CHECKSUM = 16 * 1024;
179   public static final ChecksumType DEFAULT_CHECKSUM_TYPE = ChecksumType.CRC32;
180 
181   // For measuring latency of "sequential" reads and writes
182   private static final AtomicInteger readOps = new AtomicInteger();
183   private static final AtomicLong readTimeNano = new AtomicLong();
184   private static final AtomicInteger writeOps = new AtomicInteger();
185   private static final AtomicLong writeTimeNano = new AtomicLong();
186 
187   // For measuring latency of pread
188   private static final AtomicInteger preadOps = new AtomicInteger();
189   private static final AtomicLong preadTimeNano = new AtomicLong();
190 
191   // For measuring number of checksum failures
192   static final AtomicLong checksumFailures = new AtomicLong();
193 
194   // For getting more detailed stats on FS latencies
195   // If, for some reason, the metrics subsystem stops polling for latencies,
196   // we don't want data to pile up and cause a memory leak,
197   // so once LATENCY_BUFFER_SIZE items have been enqueued for processing,
198   // further fs latency samples are silently dropped.
199   private static final int LATENCY_BUFFER_SIZE = 5000;
200   private static final BlockingQueue<Long> fsReadLatenciesNanos =
201       new ArrayBlockingQueue<Long>(LATENCY_BUFFER_SIZE);
202   private static final BlockingQueue<Long> fsWriteLatenciesNanos =
203       new ArrayBlockingQueue<Long>(LATENCY_BUFFER_SIZE);
204   private static final BlockingQueue<Long> fsPreadLatenciesNanos =
205       new ArrayBlockingQueue<Long>(LATENCY_BUFFER_SIZE);
206 
207   public static final void offerReadLatency(long latencyNanos, boolean pread) {
208     if (pread) {
209       fsPreadLatenciesNanos.offer(latencyNanos); // might be silently dropped, if the queue is full
210       preadOps.incrementAndGet();
211       preadTimeNano.addAndGet(latencyNanos);
212     } else {
213       fsReadLatenciesNanos.offer(latencyNanos); // might be silently dropped, if the queue is full
214       readTimeNano.addAndGet(latencyNanos);
215       readOps.incrementAndGet();
216     }
217   }
218 
219   public static final void offerWriteLatency(long latencyNanos) {
220     fsWriteLatenciesNanos.offer(latencyNanos); // might be silently dropped, if the queue is full
221 
222     writeTimeNano.addAndGet(latencyNanos);
223     writeOps.incrementAndGet();
224   }
225 
226   public static final Collection<Long> getReadLatenciesNanos() {
227     final List<Long> latencies =
228         Lists.newArrayListWithCapacity(fsReadLatenciesNanos.size());
229     fsReadLatenciesNanos.drainTo(latencies);
230     return latencies;
231   }
232 
233   public static final Collection<Long> getPreadLatenciesNanos() {
234     final List<Long> latencies =
235         Lists.newArrayListWithCapacity(fsPreadLatenciesNanos.size());
236     fsPreadLatenciesNanos.drainTo(latencies);
237     return latencies;
238   }
239 
240   public static final Collection<Long> getWriteLatenciesNanos() {
241     final List<Long> latencies =
242         Lists.newArrayListWithCapacity(fsWriteLatenciesNanos.size());
243     fsWriteLatenciesNanos.drainTo(latencies);
244     return latencies;
245   }
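  // Illustrative sketch (not part of the original source): a metrics publisher is
  // expected to drain these queues periodically, for example:
  //
  //   for (Long nanos : HFile.getReadLatenciesNanos()) {
  //     latencyHistogram.update(nanos);   // "latencyHistogram" is a hypothetical metrics sink
  //   }
  //
  // If nothing drains the queues, offer() silently drops new samples once
  // LATENCY_BUFFER_SIZE entries are queued, so memory use stays bounded.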
246 
247   // For test purposes
248   public static volatile AtomicLong dataBlockReadCnt = new AtomicLong(0);
249 
250   // number of sequential reads
251   public static final int getReadOps() {
252     return readOps.getAndSet(0);
253   }
254 
255   public static final long getReadTimeMs() {
256     return readTimeNano.getAndSet(0) / 1000000;
257   }
258 
259   // number of positional reads
260   public static final int getPreadOps() {
261     return preadOps.getAndSet(0);
262   }
263 
264   public static final long getPreadTimeMs() {
265     return preadTimeNano.getAndSet(0) / 1000000;
266   }
267 
268   public static final int getWriteOps() {
269     return writeOps.getAndSet(0);
270   }
271 
272   public static final long getWriteTimeMs() {
273     return writeTimeNano.getAndSet(0) / 1000000;
274   }
275 
276   /**
277    * Returns the number of checksum verification failures and
278    * clears the counter.
279    */
280   public static final long getChecksumFailuresCount() {
281     return checksumFailures.getAndSet(0);
282   }
283 
284   /** API required to write an {@link HFile} */
285   public interface Writer extends Closeable {
286 
287     /** Add an element to the file info map. */
288     void appendFileInfo(byte[] key, byte[] value) throws IOException;
289 
290     void append(KeyValue kv) throws IOException;
291 
292     void append(byte[] key, byte[] value) throws IOException;
293 
294     /** @return the path to this {@link HFile} */
295     Path getPath();
296 
297     /**
298      * Adds an inline block writer such as a multi-level block index writer or
299      * a compound Bloom filter writer.
300      */
301     void addInlineBlockWriter(InlineBlockWriter bloomWriter);
302 
303     // The below three methods take Writables.  We'd like to undo Writables but undoing the below would be pretty
304     // painful.  Could take a byte [] or a Message but we want to be backward compatible around hfiles so would need
305     // to map between Message and Writable or byte [] and current Writable serialization.  This would be a bit of work
306     // for little gain.  That's my thinking at the moment.  St.Ack 20121129
307 
308     void appendMetaBlock(String bloomFilterMetaKey, Writable metaWriter);
309 
310     /**
311      * Store general Bloom filter in the file. This does not deal with Bloom filter
312      * internals but is necessary, since Bloom filters are stored differently
313      * in HFile version 1 and version 2.
314      */
315     void addGeneralBloomFilter(BloomFilterWriter bfw);
316 
317     /**
318      * Store delete family Bloom filter in the file, which is only supported in
319      * HFile V2.
320      */
321     void addDeleteFamilyBloomFilter(BloomFilterWriter bfw) throws IOException;
322   }
323 
324   /**
325    * Writers are constructed in a variety of ways throughout the code, and
326    * we want to be able to swap writer implementations.
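   * <p>
   * A minimal usage sketch (illustrative only; it assumes a {@code Configuration conf}, a
   * {@code FileSystem fs}, a target {@code Path path} and sorted {@code KeyValue}s exist):
   * <pre>
   *   HFile.Writer w = HFile.getWriterFactory(conf, new CacheConfig(conf))
   *       .withPath(fs, path)
   *       .withBlockSize(64 * 1024)
   *       .withCompression(Compression.Algorithm.NONE)
   *       .create();
   *   w.append(kv);  // KeyValues must be appended in sorted order
   *   w.appendFileInfo(Bytes.toBytes("someInfoKey"), Bytes.toBytes("someInfoValue"));
   *   w.close();
   * </pre>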
327    */
328   public static abstract class WriterFactory {
329     protected final Configuration conf;
330     protected final CacheConfig cacheConf;
331     protected FileSystem fs;
332     protected Path path;
333     protected FSDataOutputStream ostream;
334     protected int blockSize = HColumnDescriptor.DEFAULT_BLOCKSIZE;
335     protected Compression.Algorithm compression =
336         HFile.DEFAULT_COMPRESSION_ALGORITHM;
337     protected HFileDataBlockEncoder encoder = NoOpDataBlockEncoder.INSTANCE;
338     protected KVComparator comparator = KeyValue.COMPARATOR;
339     protected InetSocketAddress[] favoredNodes;
340     protected ChecksumType checksumType = HFile.DEFAULT_CHECKSUM_TYPE;
341     protected int bytesPerChecksum = DEFAULT_BYTES_PER_CHECKSUM;
342     protected boolean includeMVCCReadpoint = true;
343 
344     WriterFactory(Configuration conf, CacheConfig cacheConf) {
345       this.conf = conf;
346       this.cacheConf = cacheConf;
347     }
348 
349     public WriterFactory withPath(FileSystem fs, Path path) {
350       Preconditions.checkNotNull(fs);
351       Preconditions.checkNotNull(path);
352       this.fs = fs;
353       this.path = path;
354       return this;
355     }
356 
357     public WriterFactory withOutputStream(FSDataOutputStream ostream) {
358       Preconditions.checkNotNull(ostream);
359       this.ostream = ostream;
360       return this;
361     }
362 
363     public WriterFactory withBlockSize(int blockSize) {
364       this.blockSize = blockSize;
365       return this;
366     }
367 
368     public WriterFactory withCompression(Compression.Algorithm compression) {
369       Preconditions.checkNotNull(compression);
370       this.compression = compression;
371       return this;
372     }
373 
374     public WriterFactory withCompression(String compressAlgo) {
375       Preconditions.checkNotNull(compressAlgo);
376       this.compression = AbstractHFileWriter.compressionByName(compressAlgo);
377       return this;
378     }
379 
380     public WriterFactory withDataBlockEncoder(HFileDataBlockEncoder encoder) {
381       Preconditions.checkNotNull(encoder);
382       this.encoder = encoder;
383       return this;
384     }
385 
386     public WriterFactory withComparator(KVComparator comparator) {
387       Preconditions.checkNotNull(comparator);
388       this.comparator = comparator;
389       return this;
390     }
391 
392     public WriterFactory withFavoredNodes(InetSocketAddress[] favoredNodes) {
393       // Deliberately not checking for null here.
394       this.favoredNodes = favoredNodes;
395       return this;
396     }
397 
398     public WriterFactory withChecksumType(ChecksumType checksumType) {
399       Preconditions.checkNotNull(checksumType);
400       this.checksumType = checksumType;
401       return this;
402     }
403 
404     public WriterFactory withBytesPerChecksum(int bytesPerChecksum) {
405       this.bytesPerChecksum = bytesPerChecksum;
406       return this;
407     }
408 
409     /**
410      * @param includeMVCCReadpoint whether to write the mvcc readpoint to the file for each KV
411      * @return this (for chained invocation)
412      */
413     public WriterFactory includeMVCCReadpoint(boolean includeMVCCReadpoint) {
414       this.includeMVCCReadpoint = includeMVCCReadpoint;
415       return this;
416     }
417 
418     public Writer create() throws IOException {
419       if ((path != null ? 1 : 0) + (ostream != null ? 1 : 0) != 1) {
420         throw new AssertionError("Please specify exactly one of " +
421             "filesystem/path or path");
422       }
423       if (path != null) {
424         ostream = AbstractHFileWriter.createOutputStream(conf, fs, path, favoredNodes);
425       }
426       return createWriter(fs, path, ostream, blockSize,
427           compression, encoder, comparator, checksumType, bytesPerChecksum, includeMVCCReadpoint);
428     }
429 
430     protected abstract Writer createWriter(FileSystem fs, Path path,
431         FSDataOutputStream ostream, int blockSize,
432         Compression.Algorithm compress,
433         HFileDataBlockEncoder dataBlockEncoder,
434         KVComparator comparator, ChecksumType checksumType,
435         int bytesPerChecksum, boolean includeMVCCReadpoint) throws IOException;
436   }
437 
438   /** The configuration key for HFile version to use for new files */
439   public static final String FORMAT_VERSION_KEY = "hfile.format.version";
440 
441   public static int getFormatVersion(Configuration conf) {
442     int version = conf.getInt(FORMAT_VERSION_KEY, MAX_FORMAT_VERSION);
443     checkFormatVersion(version);
444     return version;
445   }
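  // Illustrative sketch: selecting the HFile version for newly written files via
  // configuration (only format version 2 is supported by this class).
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setInt(HFile.FORMAT_VERSION_KEY, 2);
  //   int version = HFile.getFormatVersion(conf);  // validates and returns 2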
446 
447   /**
448    * Returns the factory to be used to create {@link HFile} writers.
449    * Disables block cache access for all writers created through the
450    * returned factory.
451    */
452   public static final WriterFactory getWriterFactoryNoCache(Configuration
453        conf) {
454     Configuration tempConf = new Configuration(conf);
455     tempConf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f);
456     return HFile.getWriterFactory(conf, new CacheConfig(tempConf));
457   }
458 
459   /**
460    * Returns the factory to be used to create {@link HFile} writers
461    */
462   public static final WriterFactory getWriterFactory(Configuration conf,
463       CacheConfig cacheConf) {
464     int version = getFormatVersion(conf);
465     switch (version) {
466     case 2:
467       return new HFileWriterV2.WriterFactoryV2(conf, cacheConf);
468     default:
469       throw new IllegalArgumentException("Cannot create writer for HFile " +
470           "format version " + version);
471     }
472   }
473 
474   /** An abstraction used by the block index */
475   public interface CachingBlockReader {
476     HFileBlock readBlock(long offset, long onDiskBlockSize,
477         boolean cacheBlock, final boolean pread, final boolean isCompaction,
478         BlockType expectedBlockType)
479         throws IOException;
480   }
481 
482   /** An interface used by clients to open and iterate an {@link HFile}. */
483   public interface Reader extends Closeable, CachingBlockReader {
484     /**
485      * Returns this reader's "name". Usually the last component of the path.
486      * Needs to remain constant even while the file is being moved, to support
487      * caching on write.
488      */
489     String getName();
490 
491     KVComparator getComparator();
492 
493     HFileScanner getScanner(boolean cacheBlocks,
494        final boolean pread, final boolean isCompaction);
495 
496     ByteBuffer getMetaBlock(String metaBlockName,
497        boolean cacheBlock) throws IOException;
498 
499     Map<byte[], byte[]> loadFileInfo() throws IOException;
500 
501     byte[] getLastKey();
502 
503     byte[] midkey() throws IOException;
504 
505     long length();
506 
507     long getEntries();
508 
509     byte[] getFirstKey();
510 
511     long indexSize();
512 
513     byte[] getFirstRowKey();
514 
515     byte[] getLastRowKey();
516 
517     FixedFileTrailer getTrailer();
518 
519     HFileBlockIndex.BlockIndexReader getDataBlockIndexReader();
520 
521     HFileScanner getScanner(boolean cacheBlocks, boolean pread);
522 
523     Compression.Algorithm getCompressionAlgorithm();
524 
525     /**
526      * Retrieves general Bloom filter metadata as appropriate for each
527      * {@link HFile} version.
528      * Knows nothing about how that metadata is structured.
529      */
530     DataInput getGeneralBloomFilterMetadata() throws IOException;
531 
532     /**
533      * Retrieves delete family Bloom filter metadata as appropriate for each
534      * {@link HFile}  version.
535      * Knows nothing about how that metadata is structured.
536      */
537     DataInput getDeleteBloomFilterMetadata() throws IOException;
538 
539     Path getPath();
540 
541     /** Close method with optional evictOnClose */
542     void close(boolean evictOnClose) throws IOException;
543 
544     DataBlockEncoding getDataBlockEncoding();
545 
546     boolean hasMVCCInfo();
547   }
548 
549   /**
550    * Returns the appropriate reader for the specified arguments.
551    * TODO This is a bad abstraction.  See HBASE-6635.
552    *
553    * @param path hfile's path
554    * @param fsdis stream of path's file
555    * @param size max size of the trailer.
556    * @param cacheConf Cache configuration values, cannot be null.
557    * @param hfs the {@link HFileSystem} to use, or null
558    * @return an appropriate instance of {@link Reader}
559    * @throws IOException If file is invalid, will throw CorruptHFileException flavored IOException
560    */
561   private static Reader pickReaderVersion(Path path, FSDataInputStreamWrapper fsdis,
562       long size, CacheConfig cacheConf, HFileSystem hfs) throws IOException {
563     FixedFileTrailer trailer = null;
564     try {
565       boolean isHBaseChecksum = fsdis.shouldUseHBaseChecksum();
566       assert !isHBaseChecksum; // Initially we must read with FS checksum.
567       trailer = FixedFileTrailer.readFromStream(fsdis.getStream(isHBaseChecksum), size);
568       switch (trailer.getMajorVersion()) {
569       case 2:
570         return new HFileReaderV2(
571           path, trailer, fsdis, size, cacheConf, hfs);
572       default:
573         throw new CorruptHFileException("Invalid HFile version " + trailer.getMajorVersion());
574       }
575     } catch (Throwable t) {
576       try {
577         fsdis.close();
578       } catch (Throwable t2) {
579         LOG.warn("Error closing fsdis FSDataInputStreamWrapper", t2);
580       }
581       throw new CorruptHFileException("Problem reading HFile Trailer from file " + path, t);
582     }
583   }
584 
585   /**
586    * @param fs A file system
587    * @param path Path to HFile
588    * @param fsdis a stream of path's file
589    * @param size max size of the trailer.
590    * @param cacheConf Cache configuration for hfile's contents
591    * @return A version-specific HFile Reader
592    * @throws IOException If file is invalid, will throw CorruptHFileException flavored IOException
593    */
594   public static Reader createReader(FileSystem fs, Path path,
595       FSDataInputStreamWrapper fsdis, long size, CacheConfig cacheConf) throws IOException {
596     HFileSystem hfs = null;
597 
598     // If the fs is not an instance of HFileSystem, then create an
599     // instance of HFileSystem that wraps over the specified fs.
600     // In this case, we will not be able to avoid checksumming inside
601     // the filesystem.
602     if (!(fs instanceof HFileSystem)) {
603       hfs = new HFileSystem(fs);
604     } else {
605       hfs = (HFileSystem)fs;
606     }
607     return pickReaderVersion(path, fsdis, size, cacheConf, hfs);
608   }
609 
610   /**
611    * Creates a reader for the given HFile path.
612    * @param fs filesystem
613    * @param path Path to file to read
614    * @param cacheConf cache configuration; must not be null, see {@link org.apache.hadoop.hbase.io.hfile.CacheConfig#CacheConfig(Configuration)}
615    * @return an active Reader instance
616    * @throws IOException Will throw a CorruptHFileException (DoNotRetryIOException subtype) if hfile is corrupt/invalid.
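   * <p>
   * A minimal read sketch (illustrative only; it assumes {@code fs}, {@code path} and a
   * {@code Configuration conf} exist):
   * <pre>
   *   HFile.Reader reader = HFile.createReader(fs, path, new CacheConfig(conf));
   *   reader.loadFileInfo();
   *   HFileScanner scanner = reader.getScanner(true, false);
   *   if (scanner.seekTo()) {          // position at the first key/value
   *     do {
   *       KeyValue kv = scanner.getKeyValue();
   *       // ... use kv ...
   *     } while (scanner.next());
   *   }
   *   reader.close();
   * </pre>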
617    */
618   public static Reader createReader(
619       FileSystem fs, Path path, CacheConfig cacheConf) throws IOException {
620     Preconditions.checkNotNull(cacheConf, "Cannot create Reader with null CacheConf");
621     FSDataInputStreamWrapper stream = new FSDataInputStreamWrapper(fs, path);
622     return pickReaderVersion(path, stream, fs.getFileStatus(path).getLen(), cacheConf, null);
623   }
624 
625   /**
626    * This factory method is used only by unit tests
627    */
628   static Reader createReaderFromStream(Path path,
629       FSDataInputStream fsdis, long size, CacheConfig cacheConf)
630       throws IOException {
631     FSDataInputStreamWrapper wrapper = new FSDataInputStreamWrapper(fsdis);
632     return pickReaderVersion(path, wrapper, size, cacheConf, null);
633   }
634 
635   /**
636    * Metadata for this file.  Conjured by the writer.  Read in by the reader.
637    */
638   static class FileInfo implements SortedMap<byte [], byte []> {
639     static final String RESERVED_PREFIX = "hfile.";
640     static final byte[] RESERVED_PREFIX_BYTES = Bytes.toBytes(RESERVED_PREFIX);
641     static final byte [] LASTKEY = Bytes.toBytes(RESERVED_PREFIX + "LASTKEY");
642     static final byte [] AVG_KEY_LEN = Bytes.toBytes(RESERVED_PREFIX + "AVG_KEY_LEN");
643     static final byte [] AVG_VALUE_LEN = Bytes.toBytes(RESERVED_PREFIX + "AVG_VALUE_LEN");
644     static final byte [] COMPARATOR = Bytes.toBytes(RESERVED_PREFIX + "COMPARATOR");
645     private final SortedMap<byte [], byte []> map = new TreeMap<byte [], byte []>(Bytes.BYTES_COMPARATOR);
646 
647     public FileInfo() {
648       super();
649     }
650 
651     /**
652      * Append the given key/value pair to the file info, optionally checking the
653      * key prefix.
654      *
655      * @param k key to add
656      * @param v value to add
657      * @param checkPrefix whether to check that the provided key does not start
658      *          with the reserved prefix
659      * @return this file info object
660      * @throws IOException if the key or value is invalid
661      */
662     public FileInfo append(final byte[] k, final byte[] v,
663         final boolean checkPrefix) throws IOException {
664       if (k == null || v == null) {
665         throw new NullPointerException("Neither key nor value may be null");
666       }
667       if (checkPrefix && isReservedFileInfoKey(k)) {
668         throw new IOException("Keys with a " + FileInfo.RESERVED_PREFIX
669             + " are reserved");
670       }
671       put(k, v);
672       return this;
673     }
674 
675     public void clear() {
676       this.map.clear();
677     }
678 
679     public Comparator<? super byte[]> comparator() {
680       return map.comparator();
681     }
682 
683     public boolean containsKey(Object key) {
684       return map.containsKey(key);
685     }
686 
687     public boolean containsValue(Object value) {
688       return map.containsValue(value);
689     }
690 
691     public Set<java.util.Map.Entry<byte[], byte[]>> entrySet() {
692       return map.entrySet();
693     }
694 
695     public boolean equals(Object o) {
696       return map.equals(o);
697     }
698 
699     public byte[] firstKey() {
700       return map.firstKey();
701     }
702 
703     public byte[] get(Object key) {
704       return map.get(key);
705     }
706 
707     public int hashCode() {
708       return map.hashCode();
709     }
710 
711     public SortedMap<byte[], byte[]> headMap(byte[] toKey) {
712       return this.map.headMap(toKey);
713     }
714 
715     public boolean isEmpty() {
716       return map.isEmpty();
717     }
718 
719     public Set<byte[]> keySet() {
720       return map.keySet();
721     }
722 
723     public byte[] lastKey() {
724       return map.lastKey();
725     }
726 
727     public byte[] put(byte[] key, byte[] value) {
728       return this.map.put(key, value);
729     }
730 
731     public void putAll(Map<? extends byte[], ? extends byte[]> m) {
732       this.map.putAll(m);
733     }
734 
735     public byte[] remove(Object key) {
736       return this.map.remove(key);
737     }
738 
739     public int size() {
740       return map.size();
741     }
742 
743     public SortedMap<byte[], byte[]> subMap(byte[] fromKey, byte[] toKey) {
744       return this.map.subMap(fromKey, toKey);
745     }
746 
747     public SortedMap<byte[], byte[]> tailMap(byte[] fromKey) {
748       return this.map.tailMap(fromKey);
749     }
750 
751     public Collection<byte[]> values() {
752       return map.values();
753     }
754 
755     /**
756      * Write out this instance to the passed-in <code>out</code> stream.
757      * We write it as a protobuf.
758      * @param out
759      * @throws IOException
760      * @see #read(DataInputStream)
761      */
762     void write(final DataOutputStream out) throws IOException {
763       HFileProtos.FileInfoProto.Builder builder = HFileProtos.FileInfoProto.newBuilder();
764       for (Map.Entry<byte [], byte[]> e: this.map.entrySet()) {
765         HBaseProtos.BytesBytesPair.Builder bbpBuilder = HBaseProtos.BytesBytesPair.newBuilder();
766         bbpBuilder.setFirst(HBaseZeroCopyByteString.wrap(e.getKey()));
767         bbpBuilder.setSecond(HBaseZeroCopyByteString.wrap(e.getValue()));
768         builder.addMapEntry(bbpBuilder.build());
769       }
770       out.write(ProtobufUtil.PB_MAGIC);
771       builder.build().writeDelimitedTo(out);
772     }
773 
774     /**
775      * Populate this instance with what we find on the passed-in <code>in</code> stream.
776      * Can deserialize either the protobuf format or the old Writables format.
777      * @param in
778      * @throws IOException
779      * @see #write(DataOutputStream)
780      */
781     void read(final DataInputStream in) throws IOException {
782       // This code is tested over in TestHFileReaderV1 where we read an old hfile w/ this new code.
783       int pblen = ProtobufUtil.lengthOfPBMagic();
784       byte [] pbuf = new byte[pblen];
785       if (in.markSupported()) in.mark(pblen);
786       int read = in.read(pbuf);
787       if (read != pblen) throw new IOException("read=" + read + ", wanted=" + pblen);
788       if (ProtobufUtil.isPBMagicPrefix(pbuf)) {
789         parsePB(HFileProtos.FileInfoProto.parseDelimitedFrom(in));
790       } else {
791         if (in.markSupported()) {
792           in.reset();
793           parseWritable(in);
794         } else {
795           // We cannot use BufferedInputStream, it consumes more than we read from the underlying IS
796           ByteArrayInputStream bais = new ByteArrayInputStream(pbuf);
797           SequenceInputStream sis = new SequenceInputStream(bais, in); // Concatenate input streams
798           // TODO: Am I leaking anything here wrapping the passed in stream?  We are not calling close on the wrapped
799           // streams but they should be let go after we leave this context?  I see that we keep a reference to the
800           // passed in inputstream but since we no longer have a reference to this after we leave, we should be ok.
801           parseWritable(new DataInputStream(sis));
802         }
803       }
804     }
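    // Illustrative round-trip sketch (not part of the original source): write() emits the
    // PB_MAGIC prefix followed by a delimited FileInfoProto, and read() accepts either that
    // protobuf form or the legacy Writable form.
    //
    //   FileInfo info = new FileInfo();
    //   info.append(Bytes.toBytes("someKey"), Bytes.toBytes("someValue"), true);
    //   ByteArrayOutputStream baos = new ByteArrayOutputStream();
    //   info.write(new DataOutputStream(baos));
    //   FileInfo copy = new FileInfo();
    //   copy.read(new DataInputStream(new ByteArrayInputStream(baos.toByteArray())));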
805 
806     /** Now parse the old Writable format.  It was a list of Map entries.  Each map entry was a key and a value of
807      * a byte [].  The old map format had a byte before each value that held a code identifying the value's
808      * type.  We know it is always a byte [], so below we just read and discard it.
809      * @throws IOException
810      */
811     void parseWritable(final DataInputStream in) throws IOException {
812       // First clear the map.  Otherwise we will just accumulate entries every time this method is called.
813       this.map.clear();
814       // Read the number of entries in the map
815       int entries = in.readInt();
816       // Then read each key/value pair
817       for (int i = 0; i < entries; i++) {
818         byte [] key = Bytes.readByteArray(in);
819         // We used to read a byte that encoded the class type.  Read and ignore it because it is always byte [] in hfile
820         in.readByte();
821         byte [] value = Bytes.readByteArray(in);
822         this.map.put(key, value);
823       }
824     }
825 
826     /**
827      * Fill our map with the content of the protobuf we read off disk
828      * @param fip protobuf message to read
829      */
830     void parsePB(final HFileProtos.FileInfoProto fip) {
831       this.map.clear();
832       for (BytesBytesPair pair: fip.getMapEntryList()) {
833         this.map.put(pair.getFirst().toByteArray(), pair.getSecond().toByteArray());
834       }
835     }
836   }
837 
838   /** Return true if the given file info key is reserved for internal use. */
839   public static boolean isReservedFileInfoKey(byte[] key) {
840     return Bytes.startsWith(key, FileInfo.RESERVED_PREFIX_BYTES);
841   }
842 
843   /**
844    * Get names of supported compression algorithms. The names are accepted by
845    * HFile.Writer.
846    *
847    * @return Array of strings, each representing a supported compression
848    *         algorithm. Currently, the following compression algorithms are
849    *         supported:
850    *         <ul>
851    *         <li>"none" - No compression.
852    *         <li>"gz" - GZIP compression.
853    *         </ul>
854    */
855   public static String[] getSupportedCompressionAlgorithms() {
856     return Compression.getSupportedAlgorithms();
857   }
858 
859   // Utility methods.
860   /*
861    * @param l Long to convert to an int.
862    * @return <code>l</code> cast as an int.
863    */
864   static int longToInt(final long l) {
865     // Expecting the size() of a block not to exceed 4GB. Assuming the
866     // size() will wrap to a negative integer if it exceeds 2GB (from TFile).
867     return (int)(l & 0x00000000ffffffffL);
868   }
869 
870   /**
871    * Returns all files belonging to the given region directory. May return an
872    * empty list.
873    *
874    * @param fs  The file system reference.
875    * @param regionDir  The region directory to scan.
876    * @return The list of files found.
877    * @throws IOException When scanning the files fails.
878    */
879   static List<Path> getStoreFiles(FileSystem fs, Path regionDir)
880       throws IOException {
881     List<Path> res = new ArrayList<Path>();
882     PathFilter dirFilter = new FSUtils.DirFilter(fs);
883     FileStatus[] familyDirs = fs.listStatus(regionDir, dirFilter);
884     for(FileStatus dir : familyDirs) {
885       FileStatus[] files = fs.listStatus(dir.getPath());
886       for (FileStatus file : files) {
887         if (!file.isDir()) {
888           res.add(file.getPath());
889         }
890       }
891     }
892     return res;
893   }
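  // Illustrative sketch (hypothetical paths): list every store file under a region directory.
  //
  //   Path regionDir = new Path("/hbase/myTable/1028785192");  // hypothetical region dir
  //   for (Path storeFile : HFile.getStoreFiles(FileSystem.get(conf), regionDir)) {
  //     LOG.info("Found store file: " + storeFile);
  //   }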
894 
895   public static void main(String[] args) throws IOException {
896     HFilePrettyPrinter prettyPrinter = new HFilePrettyPrinter();
897     System.exit(prettyPrinter.run(args));
898   }
899 
900   /**
901    * Checks the given {@link HFile} format version, and throws an exception if
902    * invalid. Note that if the version number comes from an input file and has
903    * not been verified, the caller needs to re-throw the failure as an {@link IOException}
904    * to indicate that this is not a software error, but corrupted input.
905    *
906    * @param version an HFile version
907    * @throws IllegalArgumentException if the version is invalid
908    */
909   public static void checkFormatVersion(int version)
910       throws IllegalArgumentException {
911     if (version < MIN_FORMAT_VERSION || version > MAX_FORMAT_VERSION) {
912       throw new IllegalArgumentException("Invalid HFile version: " + version
913           + " (expected to be " + "between " + MIN_FORMAT_VERSION + " and "
914           + MAX_FORMAT_VERSION + ")");
915     }
916   }
917 }