1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.io.hfile;
20  
21  import java.io.ByteArrayInputStream;
22  import java.io.Closeable;
23  import java.io.DataInput;
24  import java.io.DataInputStream;
25  import java.io.DataOutputStream;
26  import java.io.IOException;
27  import java.io.SequenceInputStream;
28  import java.nio.ByteBuffer;
29  import java.util.ArrayList;
30  import java.util.Collection;
31  import java.util.Comparator;
32  import java.util.List;
33  import java.util.Map;
34  import java.util.Set;
35  import java.util.SortedMap;
36  import java.util.TreeMap;
37  import java.util.concurrent.ArrayBlockingQueue;
38  import java.util.concurrent.BlockingQueue;
39  import java.util.concurrent.atomic.AtomicInteger;
40  import java.util.concurrent.atomic.AtomicLong;
41  
42  import org.apache.commons.logging.Log;
43  import org.apache.commons.logging.LogFactory;
44  import org.apache.hadoop.classification.InterfaceAudience;
45  import org.apache.hadoop.conf.Configuration;
46  import org.apache.hadoop.fs.FSDataInputStream;
47  import org.apache.hadoop.fs.FSDataOutputStream;
48  import org.apache.hadoop.fs.FileStatus;
49  import org.apache.hadoop.fs.FileSystem;
50  import org.apache.hadoop.fs.Path;
51  import org.apache.hadoop.fs.PathFilter;
52  import org.apache.hadoop.hbase.HColumnDescriptor;
53  import org.apache.hadoop.hbase.HConstants;
54  import org.apache.hadoop.hbase.KeyValue;
55  import org.apache.hadoop.hbase.KeyValue.KeyComparator;
56  import org.apache.hadoop.hbase.exceptions.CorruptHFileException;
57  import org.apache.hadoop.hbase.fs.HFileSystem;
58  import org.apache.hadoop.hbase.io.compress.Compression;
59  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
60  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
61  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos;
62  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.BytesBytesPair;
63  import org.apache.hadoop.hbase.protobuf.generated.HFileProtos;
64  import org.apache.hadoop.hbase.regionserver.StoreFile.WriterBuilder;
65  import org.apache.hadoop.hbase.util.BloomFilterWriter;
66  import org.apache.hadoop.hbase.util.Bytes;
67  import org.apache.hadoop.hbase.util.ChecksumType;
68  import org.apache.hadoop.hbase.util.FSUtils;
69  import org.apache.hadoop.io.RawComparator;
70  import org.apache.hadoop.io.Writable;
71  
72  import com.google.common.base.Preconditions;
73  import com.google.common.collect.Lists;
74  import com.google.protobuf.ByteString;
75  
76  /**
77   * File format for HBase.
78   * A file of sorted key/value pairs. Both keys and values are byte arrays.
79   * <p>
80   * The memory footprint of an HFile includes the following (the list below is taken from the
81   * <a
82   * href="https://issues.apache.org/jira/browse/HADOOP-3315">TFile</a> documentation
83   * but also applies to HFile):
84   * <ul>
85   * <li>Some constant overhead of reading or writing a compressed block.
86   * <ul>
87   * <li>Each compressed block requires one compression/decompression codec for
88   * I/O.
89   * <li>Temporary space to buffer the key.
90   * <li>Temporary space to buffer the value.
91   * </ul>
92   * <li>HFile index, which is proportional to the total number of Data Blocks.
93   * The total amount of memory needed to hold the index can be estimated as
94   * (56+AvgKeySize)*NumBlocks (a worked example follows this list).
95   * </ul>
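     * <p>
     * A rough worked example (an illustration with assumed numbers, not a measurement):
     * 1GB of data stored in the default 64KB blocks is about 16,384 blocks; with an
     * average key size of 40 bytes, the index needs roughly (56+40)*16384 bytes, or
     * about 1.5MB, of memory.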
96   * Suggestions on performance optimization.
97   * <ul>
98   * <li>Minimum block size. We recommend a minimum block size between
99   * 8KB and 1MB for general usage. A larger block size is preferred if files are
100  * primarily meant for sequential access. However, it leads to inefficient random
101  * access (because there is more data to decompress). Smaller blocks are good
102  * for random access, but require more memory to hold the block index, and may
103  * be slower to create (because we must flush the compressor stream at the
104  * conclusion of each data block, which leads to an FS I/O flush). Further, due
105  * to the internal caching in the compression codec, the smallest possible block
106  * size is around 20KB-30KB.
107  * <li>The current implementation does not offer true multi-threading for
108  * reading. The implementation uses FSDataInputStream seek()+read(), which is
109  * shown to be much faster than a positioned-read call in single-threaded mode.
110  * However, it also means that if multiple threads attempt to access the same
111  * HFile (using multiple scanners) simultaneously, the actual I/O is carried out
112  * sequentially even if they access different DFS blocks (Reexamine! pread seems
113  * to be 10% faster than seek+read in my testing -- stack).
114  * <li>Compression codec. Use "none" if the data is not very compressible (by
115  * compressible, I mean a compression ratio of at least 2:1). Generally, use "lzo"
116  * as the starting point for experimenting. "gz" offers a slightly better
117  * compression ratio than "lzo" but requires 4x the CPU to compress and 2x the CPU to
118  * decompress, compared to "lzo".
119  * </ul>
120  *
121  * For more on the background behind HFile, see <a
122  * href="https://issues.apache.org/jira/browse/HBASE-61">HBASE-61</a>.
123  * <p>
124  * File is made of data blocks followed by meta data blocks (if any), a fileinfo
125  * block, data block index, meta data block index, and a fixed size trailer
126  * which records the offsets at which file changes content type.
127  * <pre>&lt;data blocks>&lt;meta blocks>&lt;fileinfo>&lt;data index>&lt;meta index>&lt;trailer></pre>
128  * Each block has a bit of magic at its start.  Blocks are made up of
129  * key/values.  In data blocks, they are both byte arrays.  Metadata blocks are
130  * a String key and a byte array value.  An empty file looks like this:
131  * <pre>&lt;fileinfo>&lt;trailer></pre>.  That is, there are neither data nor meta
132  * blocks present.
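     * <p>
     * A minimal usage sketch (illustrative only: error handling is omitted, the path,
     * family and row names are made up, and the scanner calls assume the companion
     * HFileScanner and KeyValue APIs of this version):
     * <pre>{@code
     * Configuration conf = HBaseConfiguration.create();
     * FileSystem fs = FileSystem.get(conf);
     * Path path = new Path("/tmp/example.hfile");
     * CacheConfig cacheConf = new CacheConfig(conf);
     *
     * // Write a few KeyValues in sorted order.
     * HFile.Writer writer = HFile.getWriterFactory(conf, cacheConf)
     *     .withPath(fs, path)
     *     .withComparator(KeyValue.KEY_COMPARATOR)
     *     .create();
     * writer.append(new KeyValue(Bytes.toBytes("row1"), Bytes.toBytes("f"),
     *     Bytes.toBytes("q"), Bytes.toBytes("value1")));
     * writer.close();
     *
     * // Read them back with a scanner.
     * HFile.Reader reader = HFile.createReader(fs, path, cacheConf);
     * reader.loadFileInfo();
     * HFileScanner scanner = reader.getScanner(false, true);
     * if (scanner.seekTo()) {
     *   do {
     *     KeyValue kv = scanner.getKeyValue();
     *     // ... use kv ...
     *   } while (scanner.next());
     * }
     * reader.close();
     * }</pre>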
133  * <p>
134  * TODO: Do scanners need to be able to take a start and end row?
135  * TODO: Should BlockIndex know the name of its file?  Should it have a Path
136  * that points at its file say for the case where an index lives apart from
137  * an HFile instance?
138  */
139 @InterfaceAudience.Private
140 public class HFile {
141   static final Log LOG = LogFactory.getLog(HFile.class);
142 
143   /**
144    * Maximum length of key in HFile.
145    */
146   public final static int MAXIMUM_KEY_LENGTH = Integer.MAX_VALUE;
147 
148   /**
149    * Default compression: none.
150    */
151   public final static Compression.Algorithm DEFAULT_COMPRESSION_ALGORITHM =
152     Compression.Algorithm.NONE;
153 
154   /** Minimum supported HFile format version */
155   public static final int MIN_FORMAT_VERSION = 1;
156 
157   /** Maximum supported HFile format version
158    */
159   public static final int MAX_FORMAT_VERSION = 2;
160 
161   /** Default compression name: none. */
162   public final static String DEFAULT_COMPRESSION =
163     DEFAULT_COMPRESSION_ALGORITHM.getName();
164 
165   /** Meta data block name for bloom filter bits. */
166   public static final String BLOOM_FILTER_DATA_KEY = "BLOOM_FILTER_DATA";
167 
168   /**
169    * We assume that HFile path ends with
170    * ROOT_DIR/TABLE_NAME/REGION_NAME/CF_NAME/HFILE, so it has at least this
171    * many levels of nesting. This is needed for identifying table and CF name
172    * from an HFile path.
173    */
174   public final static int MIN_NUM_HFILE_PATH_LEVELS = 5;
175 
176   /**
177    * The number of bytes per checksum.
178    */
179   public static final int DEFAULT_BYTES_PER_CHECKSUM = 16 * 1024;
180   public static final ChecksumType DEFAULT_CHECKSUM_TYPE = ChecksumType.CRC32;
181 
182   // For measuring latency of "sequential" reads and writes
183   private static final AtomicInteger readOps = new AtomicInteger();
184   private static final AtomicLong readTimeNano = new AtomicLong();
185   private static final AtomicInteger writeOps = new AtomicInteger();
186   private static final AtomicLong writeTimeNano = new AtomicLong();
187 
188   // For measuring latency of pread
189   private static final AtomicInteger preadOps = new AtomicInteger();
190   private static final AtomicLong preadTimeNano = new AtomicLong();
191 
192   // For measuring number of checksum failures
193   static final AtomicLong checksumFailures = new AtomicLong();
194 
195   // For getting more detailed stats on FS latencies
196   // If, for some reason, the metrics subsystem stops polling for latencies,
197   // we don't want data to pile up as a memory leak, so after LATENCY_BUFFER_SIZE
198   // items have been enqueued for processing, FS latency stats will be dropped
199   // (and this behavior will be logged).
200   private static final int LATENCY_BUFFER_SIZE = 5000;
201   private static final BlockingQueue<Long> fsReadLatenciesNanos = 
202       new ArrayBlockingQueue<Long>(LATENCY_BUFFER_SIZE);
203   private static final BlockingQueue<Long> fsWriteLatenciesNanos = 
204       new ArrayBlockingQueue<Long>(LATENCY_BUFFER_SIZE);
205   private static final BlockingQueue<Long> fsPreadLatenciesNanos = 
206       new ArrayBlockingQueue<Long>(LATENCY_BUFFER_SIZE);
207   
208   public static final void offerReadLatency(long latencyNanos, boolean pread) {
209     if (pread) {
210       fsPreadLatenciesNanos.offer(latencyNanos); // might be silently dropped, if the queue is full
211       preadOps.incrementAndGet();
212       preadTimeNano.addAndGet(latencyNanos);
213     } else {
214       fsReadLatenciesNanos.offer(latencyNanos); // might be silently dropped, if the queue is full
215       readTimeNano.addAndGet(latencyNanos);
216       readOps.incrementAndGet();
217     }
218   }
219   
220   public static final void offerWriteLatency(long latencyNanos) {
221     fsWriteLatenciesNanos.offer(latencyNanos); // might be silently dropped, if the queue is full
222     
223     writeTimeNano.addAndGet(latencyNanos);
224     writeOps.incrementAndGet();
225   }
226   
227   public static final Collection<Long> getReadLatenciesNanos() {
228     final List<Long> latencies = 
229         Lists.newArrayListWithCapacity(fsReadLatenciesNanos.size());
230     fsReadLatenciesNanos.drainTo(latencies);
231     return latencies;
232   }
233 
234   public static final Collection<Long> getPreadLatenciesNanos() {
235     final List<Long> latencies = 
236         Lists.newArrayListWithCapacity(fsPreadLatenciesNanos.size());
237     fsPreadLatenciesNanos.drainTo(latencies);
238     return latencies;
239   }
240   
241   public static final Collection<Long> getWriteLatenciesNanos() {
242     final List<Long> latencies = 
243         Lists.newArrayListWithCapacity(fsWriteLatenciesNanos.size());
244     fsWriteLatenciesNanos.drainTo(latencies);
245     return latencies;
246   }
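      // A minimal sketch of how a metrics poller might consume the buffers above; the
      // polling loop and the "readLatencyHistogram" sink are hypothetical, not HBase APIs:
      //
      //   for (Long nanos : HFile.getReadLatenciesNanos()) { // drains fsReadLatenciesNanos
      //     readLatencyHistogram.update(nanos);
      //   }
      //
      // Any samples that arrived while the queue was full were already dropped by
      // offer(), which is the memory bound described in the comment above.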
247 
248   // For test purposes.
249   public static volatile AtomicLong dataBlockReadCnt = new AtomicLong(0);
250 
251   // number of sequential reads
252   public static final int getReadOps() {
253     return readOps.getAndSet(0);
254   }
255 
256   public static final long getReadTimeMs() {
257     return readTimeNano.getAndSet(0) / 1000000;
258   }
259 
260   // number of positional reads
261   public static final int getPreadOps() {
262     return preadOps.getAndSet(0);
263   }
264 
265   public static final long getPreadTimeMs() {
266     return preadTimeNano.getAndSet(0) / 1000000;
267   }
268 
269   public static final int getWriteOps() {
270     return writeOps.getAndSet(0);
271   }
272 
273   public static final long getWriteTimeMs() {
274     return writeTimeNano.getAndSet(0) / 1000000;
275   }
276 
277   /**
278    * Returns the number of checksum verification failures seen since the last
279    * call, and clears the counter.
280    */
281   public static final long getChecksumFailuresCount() {
282     return checksumFailures.getAndSet(0);
283   }
284 
285   /** API required to write an {@link HFile} */
286   public interface Writer extends Closeable {
287 
288     /** Add an element to the file info map. */
289     void appendFileInfo(byte[] key, byte[] value) throws IOException;
290 
291     void append(KeyValue kv) throws IOException;
292 
293     void append(byte[] key, byte[] value) throws IOException;
294 
295     /** @return the path to this {@link HFile} */
296     Path getPath();
297 
298     /**
299      * Adds an inline block writer such as a multi-level block index writer or
300      * a compound Bloom filter writer.
301      */
302     void addInlineBlockWriter(InlineBlockWriter bloomWriter);
303 
304     // The below three methods take Writables.  We'd like to undo Writables but undoing the below would be pretty
305     // painful.  Could take a byte [] or a Message but we want to be backward compatible around hfiles so would need
306     // to map between Message and Writable or byte [] and the current Writable serialization.  This would be a bit of work
307     // for little gain.  That's my thinking at the moment.  St.Ack 20121129
308 
309     void appendMetaBlock(String bloomFilterMetaKey, Writable metaWriter);
310 
311     /**
312      * Store general Bloom filter in the file. This does not deal with Bloom filter
313      * internals but is necessary, since Bloom filters are stored differently
314      * in HFile version 1 and version 2.
315      */
316     void addGeneralBloomFilter(BloomFilterWriter bfw);
317 
318     /**
319      * Store delete family Bloom filter in the file, which is only supported in
320      * HFile V2.
321      */
322     void addDeleteFamilyBloomFilter(BloomFilterWriter bfw) throws IOException;
323   }
324 
325   /**
326    * This variety of ways to construct writers is used throughout the code, and
327    * we want to be able to swap writer implementations.
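       * <p>
       * A typical chained configuration looks like the sketch below (the values shown
       * are illustrative, not recommendations):
       * <pre>{@code
       * HFile.Writer writer = HFile.getWriterFactory(conf, cacheConf)
       *     .withPath(fs, path)
       *     .withBlockSize(64 * 1024)
       *     .withCompression(Compression.Algorithm.GZ)
       *     .withChecksumType(HFile.DEFAULT_CHECKSUM_TYPE)
       *     .create();
       * }</pre>
       * Exactly one of {@link #withPath(FileSystem, Path)} or
       * {@link #withOutputStream(FSDataOutputStream)} must be set before calling
       * {@link #create()}.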
328    */
329   public static abstract class WriterFactory {
330     protected final Configuration conf;
331     protected final CacheConfig cacheConf;
332     protected FileSystem fs;
333     protected Path path;
334     protected FSDataOutputStream ostream;
335     protected int blockSize = HColumnDescriptor.DEFAULT_BLOCKSIZE;
336     protected Compression.Algorithm compression =
337         HFile.DEFAULT_COMPRESSION_ALGORITHM;
338     protected HFileDataBlockEncoder encoder = NoOpDataBlockEncoder.INSTANCE;
339     protected KeyComparator comparator;
340     protected ChecksumType checksumType = HFile.DEFAULT_CHECKSUM_TYPE;
341     protected int bytesPerChecksum = DEFAULT_BYTES_PER_CHECKSUM;
342     protected boolean includeMVCCReadpoint = true;
343 
344     WriterFactory(Configuration conf, CacheConfig cacheConf) {
345       this.conf = conf;
346       this.cacheConf = cacheConf;
347     }
348 
349     public WriterFactory withPath(FileSystem fs, Path path) {
350       Preconditions.checkNotNull(fs);
351       Preconditions.checkNotNull(path);
352       this.fs = fs;
353       this.path = path;
354       return this;
355     }
356 
357     public WriterFactory withOutputStream(FSDataOutputStream ostream) {
358       Preconditions.checkNotNull(ostream);
359       this.ostream = ostream;
360       return this;
361     }
362 
363     public WriterFactory withBlockSize(int blockSize) {
364       this.blockSize = blockSize;
365       return this;
366     }
367 
368     public WriterFactory withCompression(Compression.Algorithm compression) {
369       Preconditions.checkNotNull(compression);
370       this.compression = compression;
371       return this;
372     }
373 
374     public WriterFactory withCompression(String compressAlgo) {
375       Preconditions.checkNotNull(compressAlgo);
376       this.compression = AbstractHFileWriter.compressionByName(compressAlgo);
377       return this;
378     }
379 
380     public WriterFactory withDataBlockEncoder(HFileDataBlockEncoder encoder) {
381       Preconditions.checkNotNull(encoder);
382       this.encoder = encoder;
383       return this;
384     }
385 
386     public WriterFactory withComparator(KeyComparator comparator) {
387       Preconditions.checkNotNull(comparator);
388       this.comparator = comparator;
389       return this;
390     }
391 
392     public WriterFactory withChecksumType(ChecksumType checksumType) {
393       Preconditions.checkNotNull(checksumType);
394       this.checksumType = checksumType;
395       return this;
396     }
397 
398     public WriterFactory withBytesPerChecksum(int bytesPerChecksum) {
399       this.bytesPerChecksum = bytesPerChecksum;
400       return this;
401     }
402 
403     /**
404      * @param includeMVCCReadpoint whether to write the mvcc readpoint to the file for each KV
405      * @return this (for chained invocation)
406      */
407     public WriterFactory includeMVCCReadpoint(boolean includeMVCCReadpoint) {
408       this.includeMVCCReadpoint = includeMVCCReadpoint;
409       return this;
410     }
411 
412     public Writer create() throws IOException {
413       if ((path != null ? 1 : 0) + (ostream != null ? 1 : 0) != 1) {
414         throw new AssertionError("Please specify exactly one of " +
415             "filesystem/path or path");
416       }
417       if (path != null) {
418         ostream = AbstractHFileWriter.createOutputStream(conf, fs, path);
419       }
420       return createWriter(fs, path, ostream, blockSize,
421           compression, encoder, comparator, checksumType, bytesPerChecksum, includeMVCCReadpoint);
422     }
423 
424     protected abstract Writer createWriter(FileSystem fs, Path path,
425         FSDataOutputStream ostream, int blockSize,
426         Compression.Algorithm compress,
427         HFileDataBlockEncoder dataBlockEncoder,
428         KeyComparator comparator, ChecksumType checksumType,
429         int bytesPerChecksum, boolean includeMVCCReadpoint) throws IOException;
430   }
431 
432   /** The configuration key for HFile version to use for new files */
433   public static final String FORMAT_VERSION_KEY = "hfile.format.version";
434 
435   public static int getFormatVersion(Configuration conf) {
436     int version = conf.getInt(FORMAT_VERSION_KEY, MAX_FORMAT_VERSION);
437     checkFormatVersion(version);
438     return version;
439   }
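      // Sketch: picking the format version for newly written files through configuration.
      // Version 2 is the default, and getWriterFactory() below only creates version 2
      // writers.
      //
      //   Configuration conf = HBaseConfiguration.create();
      //   conf.setInt(HFile.FORMAT_VERSION_KEY, 2); // must lie between MIN_ and MAX_FORMAT_VERSION
      //   int version = HFile.getFormatVersion(conf);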
440 
441   /**
442    * Returns the factory to be used to create {@link HFile} writers.
443    * Disables block cache access for all writers created through the
444    * returned factory.
445    */
446   public static final WriterFactory getWriterFactoryNoCache(Configuration
447        conf) {
448     Configuration tempConf = new Configuration(conf);
449     tempConf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f);
450     return HFile.getWriterFactory(conf, new CacheConfig(tempConf));
451   }
452 
453   /**
454    * Returns the factory to be used to create {@link HFile} writers
455    */
456   public static final WriterFactory getWriterFactory(Configuration conf,
457       CacheConfig cacheConf) {
458     int version = getFormatVersion(conf);
459     switch (version) {
460     case 2:
461       return new HFileWriterV2.WriterFactoryV2(conf, cacheConf);
462     default:
463       throw new IllegalArgumentException("Cannot create writer for HFile " +
464           "format version " + version);
465     }
466   }
467 
468   /** An abstraction used by the block index */
469   public interface CachingBlockReader {
470     HFileBlock readBlock(long offset, long onDiskBlockSize,
471         boolean cacheBlock, final boolean pread, final boolean isCompaction,
472         BlockType expectedBlockType)
473         throws IOException;
474   }
475 
476   /** An interface used by clients to open and iterate an {@link HFile}. */
477   public interface Reader extends Closeable, CachingBlockReader {
478     /**
479      * Returns this reader's "name". Usually the last component of the path.
480      * Needs to be constant as the file is being moved to support caching on
481      * write.
482      */
483     String getName();
484 
485     RawComparator<byte []> getComparator();
486 
487     HFileScanner getScanner(boolean cacheBlocks,
488        final boolean pread, final boolean isCompaction);
489 
490     ByteBuffer getMetaBlock(String metaBlockName,
491        boolean cacheBlock) throws IOException;
492 
493     Map<byte[], byte[]> loadFileInfo() throws IOException;
494 
495     byte[] getLastKey();
496 
497     byte[] midkey() throws IOException;
498 
499     long length();
500 
501     long getEntries();
502 
503     byte[] getFirstKey();
504 
505     long indexSize();
506 
507     byte[] getFirstRowKey();
508 
509     byte[] getLastRowKey();
510 
511     FixedFileTrailer getTrailer();
512 
513     HFileBlockIndex.BlockIndexReader getDataBlockIndexReader();
514 
515     HFileScanner getScanner(boolean cacheBlocks, boolean pread);
516 
517     Compression.Algorithm getCompressionAlgorithm();
518 
519     /**
520      * Retrieves general Bloom filter metadata as appropriate for each
521      * {@link HFile} version.
522      * Knows nothing about how that metadata is structured.
523      */
524     DataInput getGeneralBloomFilterMetadata() throws IOException;
525 
526     /**
527      * Retrieves delete family Bloom filter metadata as appropriate for each
528      * {@link HFile}  version.
529      * Knows nothing about how that metadata is structured.
530      */
531     DataInput getDeleteBloomFilterMetadata() throws IOException;
532 
533     Path getPath();
534 
535     /** Close method with optional evictOnClose */
536     void close(boolean evictOnClose) throws IOException;
537 
538     DataBlockEncoding getEncodingOnDisk();
539   }
540 
541   /**
542    * Method returns the reader given the specified arguments.
543    * TODO This is a bad abstraction.  See HBASE-6635.
544    *
545    * @param path hfile's path
546    * @param fsdis an open checksummed stream of path's file
547    * @param fsdisNoFsChecksum an open unchecksummed stream of path's file
548    * @param size Length of the stream (the file size); used to locate the trailer.
549    * @param closeIStream whether to close the stream after getting the reader.
550    * @param cacheConf Cache configuration values, cannot be null.
551    * @param preferredEncodingInCache Preferred in-cache data encoding algorithm.
552    * @param hfs The {@link HFileSystem} to use, if any.
553    * @return an appropriate instance of HFileReader
554    * @throws IOException If file is invalid, will throw CorruptHFileException flavored IOException
555    */
556   private static Reader pickReaderVersion(Path path, FSDataInputStream fsdis,
557       FSDataInputStream fsdisNoFsChecksum,
558       long size, boolean closeIStream, CacheConfig cacheConf,
559       DataBlockEncoding preferredEncodingInCache, HFileSystem hfs)
560       throws IOException {
561     FixedFileTrailer trailer = null;
562     try {
563       trailer = FixedFileTrailer.readFromStream(fsdis, size);
564     } catch (IllegalArgumentException iae) {
565       throw new CorruptHFileException("Problem reading HFile Trailer from file " + path, iae);
566     }
567     switch (trailer.getMajorVersion()) {
568     case 2:
569       return new HFileReaderV2(path, trailer, fsdis, fsdisNoFsChecksum,
570           size, closeIStream,
571           cacheConf, preferredEncodingInCache, hfs);
572     default:
573       throw new CorruptHFileException("Invalid HFile version " + trailer.getMajorVersion());
574     }
575   }
576 
577   /**
578    * @param fs A file system
579    * @param path Path to HFile
580    * @param cacheConf Cache configuration for hfile's contents
581    * @param preferredEncodingInCache Preferred in-cache data encoding algorithm.
582    * @return A version-specific HFile Reader
583    * @throws IOException If file is invalid, will throw CorruptHFileException flavored IOException
584    */
585   public static Reader createReaderWithEncoding(
586       FileSystem fs, Path path, CacheConfig cacheConf,
587       DataBlockEncoding preferredEncodingInCache) throws IOException {
588     final boolean closeIStream = true;
589     HFileSystem hfs = null;
590     FSDataInputStream fsdis = fs.open(path);
591     FSDataInputStream fsdisNoFsChecksum = fsdis;
592     // If the fs is not an instance of HFileSystem, then create an 
593     // instance of HFileSystem that wraps over the specified fs.
594     // In this case, we will not be able to avoid checksumming inside
595     // the filesystem.
596     if (!(fs instanceof HFileSystem)) {
597       hfs = new HFileSystem(fs);
598     } else {
599       hfs = (HFileSystem)fs;
600       // open a stream to read data without checksum verification in
601       // the filesystem
602       fsdisNoFsChecksum = hfs.getNoChecksumFs().open(path);
603     }
604     return pickReaderVersion(path, fsdis, fsdisNoFsChecksum,
605         fs.getFileStatus(path).getLen(), closeIStream, cacheConf,
606         preferredEncodingInCache, hfs);
607   }
608 
609   /**
610    * @param fs A file system
611    * @param path Path to HFile
612    * @param fsdis an open checksummed stream of path's file
613    * @param fsdisNoFsChecksum an open unchecksummed stream of path's file
614    * @param size Length of the stream (the file size); used to locate the trailer.
615    * @param cacheConf Cache configuration for hfile's contents
616    * @param preferredEncodingInCache Preferred in-cache data encoding algorithm.
617    * @param closeIStream whether to close the stream after getting the reader.
618    * @return A version-specific HFile Reader
619    * @throws IOException If file is invalid, will throw CorruptHFileException flavored IOException
620    */
621   public static Reader createReaderWithEncoding(
622       FileSystem fs, Path path, FSDataInputStream fsdis,
623       FSDataInputStream fsdisNoFsChecksum, long size, CacheConfig cacheConf,
624       DataBlockEncoding preferredEncodingInCache, boolean closeIStream)
625       throws IOException {
626     HFileSystem hfs = null;
627 
628     // If the fs is not an instance of HFileSystem, then create an
629     // instance of HFileSystem that wraps over the specified fs.
630     // In this case, we will not be able to avoid checksumming inside
631     // the filesystem.
632     if (!(fs instanceof HFileSystem)) {
633       hfs = new HFileSystem(fs);
634     } else {
635       hfs = (HFileSystem)fs;
636     }
637     return pickReaderVersion(path, fsdis, fsdisNoFsChecksum, size,
638                              closeIStream, cacheConf,
639                              preferredEncodingInCache, hfs);
640   }
641 
642   /**
643    *
644    * @param fs filesystem
645    * @param path Path to file to read
646    * @param cacheConf This must not be null; see {@link org.apache.hadoop.hbase.io.hfile.CacheConfig#CacheConfig(Configuration)}
647    * @return an active Reader instance
648    * @throws IOException Will throw a CorruptHFileException (DoNotRetryIOException subtype) if hfile is corrupt/invalid.
649    */
650   public static Reader createReader(
651       FileSystem fs, Path path, CacheConfig cacheConf) throws IOException {
652     Preconditions.checkNotNull(cacheConf, "Cannot create Reader with null CacheConf");
653     return createReaderWithEncoding(fs, path, cacheConf,
654         DataBlockEncoding.NONE);
655   }
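      // Sketch of opening a reader and asking for a scanner; the flag meanings follow
      // the Reader interface above (cacheBlocks, pread, isCompaction), and the
      // CacheConfig shown is just the simple Configuration-based one:
      //
      //   HFile.Reader reader = HFile.createReader(fs, path, new CacheConfig(conf));
      //   HFileScanner scanner = reader.getScanner(true /* cacheBlocks */, true /* pread */);
      //   ...
      //   reader.close(true /* evictOnClose */);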
656 
657   /**
658    * This factory method is used only by unit tests
659    */
660   static Reader createReaderFromStream(Path path,
661       FSDataInputStream fsdis, long size, CacheConfig cacheConf)
662       throws IOException {
663     final boolean closeIStream = false;
664     return pickReaderVersion(path, fsdis, fsdis, size, closeIStream, cacheConf,
665         DataBlockEncoding.NONE, null);
666   }
667 
668   /**
669    * Metadata for this file.  Conjured by the writer.  Read in by the reader.
670    */
671   static class FileInfo implements SortedMap<byte [], byte []> {
672     static final String RESERVED_PREFIX = "hfile.";
673     static final byte[] RESERVED_PREFIX_BYTES = Bytes.toBytes(RESERVED_PREFIX);
674     static final byte [] LASTKEY = Bytes.toBytes(RESERVED_PREFIX + "LASTKEY");
675     static final byte [] AVG_KEY_LEN = Bytes.toBytes(RESERVED_PREFIX + "AVG_KEY_LEN");
676     static final byte [] AVG_VALUE_LEN = Bytes.toBytes(RESERVED_PREFIX + "AVG_VALUE_LEN");
677     static final byte [] COMPARATOR = Bytes.toBytes(RESERVED_PREFIX + "COMPARATOR");
678     private final SortedMap<byte [], byte []> map = new TreeMap<byte [], byte []>(Bytes.BYTES_COMPARATOR);
679 
680     public FileInfo() {
681       super();
682     }
683 
684     /**
685      * Append the given key/value pair to the file info, optionally checking the
686      * key prefix.
687      *
688      * @param k key to add
689      * @param v value to add
690      * @param checkPrefix whether to check that the provided key does not start
691      *          with the reserved prefix
692      * @return this file info object
693      * @throws IOException if the key or value is invalid
694      */
695     public FileInfo append(final byte[] k, final byte[] v,
696         final boolean checkPrefix) throws IOException {
697       if (k == null || v == null) {
698         throw new NullPointerException("Neither key nor value may be null");
699       }
700       if (checkPrefix && isReservedFileInfoKey(k)) {
701         throw new IOException("Keys with a " + FileInfo.RESERVED_PREFIX
702             + " are reserved");
703       }
704       put(k, v);
705       return this;
706     }
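        // Sketch: user-supplied metadata normally reaches this map through
        // HFile.Writer#appendFileInfo, and the checkPrefix path of append() above rejects
        // keys carrying the reserved "hfile." prefix ("APP_METADATA" is just an
        // illustrative key, not a predefined constant):
        //
        //   writer.appendFileInfo(Bytes.toBytes("APP_METADATA"), Bytes.toBytes("v1")); // ok
        //   writer.appendFileInfo(Bytes.toBytes("hfile.CUSTOM"), ...);                 // throws IOException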
707 
708     public void clear() {
709       this.map.clear();
710     }
711 
712     public Comparator<? super byte[]> comparator() {
713       return map.comparator();
714     }
715 
716     public boolean containsKey(Object key) {
717       return map.containsKey(key);
718     }
719 
720     public boolean containsValue(Object value) {
721       return map.containsValue(value);
722     }
723 
724     public Set<java.util.Map.Entry<byte[], byte[]>> entrySet() {
725       return map.entrySet();
726     }
727 
728     public boolean equals(Object o) {
729       return map.equals(o);
730     }
731 
732     public byte[] firstKey() {
733       return map.firstKey();
734     }
735 
736     public byte[] get(Object key) {
737       return map.get(key);
738     }
739 
740     public int hashCode() {
741       return map.hashCode();
742     }
743 
744     public SortedMap<byte[], byte[]> headMap(byte[] toKey) {
745       return this.map.headMap(toKey);
746     }
747 
748     public boolean isEmpty() {
749       return map.isEmpty();
750     }
751 
752     public Set<byte[]> keySet() {
753       return map.keySet();
754     }
755 
756     public byte[] lastKey() {
757       return map.lastKey();
758     }
759 
760     public byte[] put(byte[] key, byte[] value) {
761       return this.map.put(key, value);
762     }
763 
764     public void putAll(Map<? extends byte[], ? extends byte[]> m) {
765       this.map.putAll(m);
766     }
767 
768     public byte[] remove(Object key) {
769       return this.map.remove(key);
770     }
771 
772     public int size() {
773       return map.size();
774     }
775 
776     public SortedMap<byte[], byte[]> subMap(byte[] fromKey, byte[] toKey) {
777       return this.map.subMap(fromKey, toKey);
778     }
779 
780     public SortedMap<byte[], byte[]> tailMap(byte[] fromKey) {
781       return this.map.tailMap(fromKey);
782     }
783 
784     public Collection<byte[]> values() {
785       return map.values();
786     }
787 
788     /**
789      * Write out this instance on the passed in <code>out</code> stream.
790      * We write it as a protobuf.
791      * @param out
792      * @throws IOException
793      * @see {@link #read(DataInputStream)}
794      */
795     void write(final DataOutputStream out) throws IOException {
796       HFileProtos.FileInfoProto.Builder builder = HFileProtos.FileInfoProto.newBuilder();
797       for (Map.Entry<byte [], byte[]> e: this.map.entrySet()) {
798         HBaseProtos.BytesBytesPair.Builder bbpBuilder = HBaseProtos.BytesBytesPair.newBuilder();
799         bbpBuilder.setFirst(ByteString.copyFrom(e.getKey()));
800         bbpBuilder.setSecond(ByteString.copyFrom(e.getValue()));
801         builder.addMapEntry(bbpBuilder.build());
802       }
803       out.write(ProtobufUtil.PB_MAGIC);
804       builder.build().writeDelimitedTo(out);
805     }
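        // On-disk layout produced by write() above (sketch): the PB magic bytes followed
        // by a varint-length-delimited FileInfoProto holding repeated BytesBytesPair
        // map entries.
        //
        //   +----------+---------------+-------------------------------------+
        //   | PB_MAGIC | varint length | FileInfoProto (BytesBytesPair list)  |
        //   +----------+---------------+-------------------------------------+
        //
        // read() below checks for the magic prefix first and falls back to the old
        // Writable format when it is absent.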
806 
807     /**
808      * Populate this instance with what we find on the passed in <code>in</code> stream.
809      * Can deserialize protobuf of old Writables format.
810      * @param in
811      * @throws IOException
812      * @see {@link #write(DataOutputStream)}
813      */
814     void read(final DataInputStream in) throws IOException {
815       // This code is tested over in TestHFileReaderV1 where we read an old hfile w/ this new code.
816       int pblen = ProtobufUtil.lengthOfPBMagic();
817       byte [] pbuf = new byte[pblen];
818       if (in.markSupported()) in.mark(pblen);
819       int read = in.read(pbuf);
820       if (read != pblen) throw new IOException("read=" + read + ", wanted=" + pblen);
821       if (ProtobufUtil.isPBMagicPrefix(pbuf)) {
822         parsePB(HFileProtos.FileInfoProto.parseDelimitedFrom(in));
823       } else {
824         if (in.markSupported()) {
825           in.reset();
826           parseWritable(in);
827         } else {
828           // We cannot use BufferedInputStream, it consumes more than we read from the underlying IS
829           ByteArrayInputStream bais = new ByteArrayInputStream(pbuf);
830           SequenceInputStream sis = new SequenceInputStream(bais, in); // Concatenate input streams
831           // TODO: Am I leaking anything here wrapping the passed in stream?  We are not calling close on the wrapped
832           // streams but they should be let go after we leave this context?  I see that we keep a reference to the
833           // passed in inputstream but since we no longer have a reference to this after we leave, we should be ok.
834           parseWritable(new DataInputStream(sis));
835         }
836       }
837     }
838 
839     /** Now parse the old Writable format.  It was a list of Map entries.  Each map entry was a key and a value of
840      * a byte [].  The old map format had a byte before each entry that held a code which was short for the key or
841      * value type.  We know it was a byte [] so below we just read that byte and ignore it.
842      * @throws IOException 
843      */
844     void parseWritable(final DataInputStream in) throws IOException {
845       // First clear the map.  Otherwise we will just accumulate entries every time this method is called.
846       this.map.clear();
847       // Read the number of entries in the map
848       int entries = in.readInt();
849       // Then read each key/value pair
850       for (int i = 0; i < entries; i++) {
851         byte [] key = Bytes.readByteArray(in);
852         // We used to read a byte that encoded the class type.  Read and ignore it because it is always byte [] in hfile
853         in.readByte();
854         byte [] value = Bytes.readByteArray(in);
855         this.map.put(key, value);
856       }
857     }
858 
859     /**
860      * Fill our map with content of the pb we read off disk
861      * @param fip protobuf message to read
862      */
863     void parsePB(final HFileProtos.FileInfoProto fip) {
864       this.map.clear();
865       for (BytesBytesPair pair: fip.getMapEntryList()) {
866         this.map.put(pair.getFirst().toByteArray(), pair.getSecond().toByteArray());
867       }
868     }
869   }
870 
871   /** Return true if the given file info key is reserved for internal use. */
872   public static boolean isReservedFileInfoKey(byte[] key) {
873     return Bytes.startsWith(key, FileInfo.RESERVED_PREFIX_BYTES);
874   }
875 
876   /**
877    * Get names of supported compression algorithms. The names are acceptable by
878    * HFile.Writer.
879    *
880    * @return Array of strings, each represents a supported compression
881    *         algorithm. Currently, the following compression algorithms are
882    *         supported.
883    *         <ul>
884    *         <li>"none" - No compression.
885    *         <li>"gz" - GZIP compression.
886    *         </ul>
887    */
888   public static String[] getSupportedCompressionAlgorithms() {
889     return Compression.getSupportedAlgorithms();
890   }
891 
892   // Utility methods.
893   /*
894    * @param l Long to convert to an int.
895    * @return <code>l</code> cast as an int.
896    */
897   static int longToInt(final long l) {
898     // Expecting the size() of a block not exceeding 4GB. Assuming the
899     // size() will wrap to negative integer if it exceeds 2GB (From tfile).
900     return (int)(l & 0x00000000ffffffffL);
901   }
902 
903   /**
904    * Returns all files belonging to the given region directory. Could return an
905    * empty list.
906    *
907    * @param fs  The file system reference.
908    * @param regionDir  The region directory to scan.
909    * @return The list of files found.
910    * @throws IOException When scanning the files fails.
911    */
912   static List<Path> getStoreFiles(FileSystem fs, Path regionDir)
913       throws IOException {
914     List<Path> res = new ArrayList<Path>();
915     PathFilter dirFilter = new FSUtils.DirFilter(fs);
916     FileStatus[] familyDirs = fs.listStatus(regionDir, dirFilter);
917     for(FileStatus dir : familyDirs) {
918       FileStatus[] files = fs.listStatus(dir.getPath());
919       for (FileStatus file : files) {
920         if (!file.isDir()) {
921           res.add(file.getPath());
922         }
923       }
924     }
925     return res;
926   }
927 
928   public static void main(String[] args) throws IOException {
929     HFilePrettyPrinter prettyPrinter = new HFilePrettyPrinter();
930     System.exit(prettyPrinter.run(args));
931   }
932 
933   /**
934    * Checks the given {@link HFile} format version, and throws an exception if
935    * invalid. Note that if the version number comes from an input file and has
936    * not been verified, the caller needs to translate the resulting exception into an
937    * {@link IOException} to indicate that this is not a software error, but corrupted input.
938    *
939    * @param version an HFile version
940    * @throws IllegalArgumentException if the version is invalid
941    */
942   public static void checkFormatVersion(int version)
943       throws IllegalArgumentException {
944     if (version < MIN_FORMAT_VERSION || version > MAX_FORMAT_VERSION) {
945       throw new IllegalArgumentException("Invalid HFile version: " + version
946           + " (expected to be " + "between " + MIN_FORMAT_VERSION + " and "
947           + MAX_FORMAT_VERSION + ")");
948     }
949   }
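      // Sketch of the caller-side pattern the javadoc above describes: when the version
      // came from an untrusted input file, translate the IllegalArgumentException into an
      // IOException so it is treated as corrupt input rather than a programming error.
      //
      //   try {
      //     HFile.checkFormatVersion(versionReadFromTrailer); // hypothetical local variable
      //   } catch (IllegalArgumentException e) {
      //     throw new CorruptHFileException("Bad HFile version in " + path, e);
      //   }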
950 }