View Javadoc

1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.io.hfile;
20  
21  import java.io.DataInput;
22  import java.io.IOException;
23  import java.nio.ByteBuffer;
24  import java.util.ArrayList;
25  import java.util.List;
26  
27  import org.apache.commons.logging.Log;
28  import org.apache.commons.logging.LogFactory;
29  import org.apache.hadoop.classification.InterfaceAudience;
30  import org.apache.hadoop.fs.FSDataInputStream;
31  import org.apache.hadoop.fs.Path;
32  import org.apache.hadoop.hbase.HConstants;
33  import org.apache.hadoop.hbase.KeyValue;
34  import org.apache.hadoop.hbase.fs.HFileSystem;
35  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoder;
36  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
37  import org.apache.hadoop.hbase.io.hfile.HFile.FileInfo;
38  import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper;
39  import org.apache.hadoop.hbase.util.Bytes;
40  import org.apache.hadoop.hbase.util.IdLock;
41  import org.apache.hadoop.io.WritableUtils;
42  import org.cloudera.htrace.Trace;
43  import org.cloudera.htrace.TraceScope;
44  
45  /**
46   * {@link HFile} reader for version 2.
47   */
48  @InterfaceAudience.Private
49  public class HFileReaderV2 extends AbstractHFileReader {
50  
51    private static final Log LOG = LogFactory.getLog(HFileReaderV2.class);
52  
53    /**
54     * The size of a (key length, value length) tuple that prefixes each entry in
55     * a data block.
56     */
57    private static int KEY_VALUE_LEN_SIZE = 2 * Bytes.SIZEOF_INT;
58  
59    private boolean includesMemstoreTS = false;
60    private boolean decodeMemstoreTS = false;
61  
62    private boolean shouldIncludeMemstoreTS() {
63      return includesMemstoreTS;
64    }
65  
66    /** Filesystem-level block reader. */
67    private HFileBlock.FSReader fsBlockReader;
68  
69    /**
70     * A "sparse lock" implementation allowing to lock on a particular block
71     * identified by offset. The purpose of this is to avoid two clients loading
72     * the same block, and have all but one client wait to get the block from the
73     * cache.
74     */
75    private IdLock offsetLock = new IdLock();
76  
77    /**
78     * Blocks read from the load-on-open section, excluding data root index, meta
79     * index, and file info.
80     */
81    private List<HFileBlock> loadOnOpenBlocks = new ArrayList<HFileBlock>();
82  
83    /** Minimum minor version supported by this HFile format */
84    static final int MIN_MINOR_VERSION = 0;
85  
86    /** Maximum minor version supported by this HFile format */
87    // We went to version 2 when we moved to pb'ing fileinfo and the trailer on
88    // the file. This version can read Writables version 1.
89    static final int MAX_MINOR_VERSION = 3;
90  
91    /** Minor versions starting with this number have faked index key */
92    static final int MINOR_VERSION_WITH_FAKED_KEY = 3;
93  
94    /**
95     * Opens a HFile. You must load the index before you can use it by calling
96     * {@link #loadFileInfo()}.
97     *
98     * @param path Path to HFile.
99     * @param trailer File trailer.
100    * @param fsdis input stream.
101    * @param size Length of the stream.
102    * @param cacheConf Cache configuration.
103    * @param preferredEncodingInCache the encoding to use in cache in case we
104    *          have a choice. If the file is already encoded on disk, we will
105    *          still use its on-disk encoding in cache.
106    */
107   public HFileReaderV2(Path path, FixedFileTrailer trailer,
108       final FSDataInputStreamWrapper fsdis, final long size, final CacheConfig cacheConf,
109       DataBlockEncoding preferredEncodingInCache, final HFileSystem hfs)
110       throws IOException {
111     super(path, trailer, size, cacheConf, hfs);
112     trailer.expectMajorVersion(2);
113     validateMinorVersion(path, trailer.getMinorVersion());
114     HFileBlock.FSReaderV2 fsBlockReaderV2 = new HFileBlock.FSReaderV2(fsdis,
115         compressAlgo, fileSize, trailer.getMinorVersion(), hfs, path);
116     this.fsBlockReader = fsBlockReaderV2; // upcast
117 
118     // Comparator class name is stored in the trailer in version 2.
119     comparator = trailer.createComparator();
120     dataBlockIndexReader = new HFileBlockIndex.BlockIndexReader(comparator,
121         trailer.getNumDataIndexLevels(), this);
122     metaBlockIndexReader = new HFileBlockIndex.BlockIndexReader(
123         Bytes.BYTES_RAWCOMPARATOR, 1);
124 
125     // Parse load-on-open data.
126 
127     HFileBlock.BlockIterator blockIter = fsBlockReaderV2.blockRange(
128         trailer.getLoadOnOpenDataOffset(),
129         fileSize - trailer.getTrailerSize());
130 
131     // Data index. We also read statistics about the block index written after
132     // the root level.
133     dataBlockIndexReader.readMultiLevelIndexRoot(
134         blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
135         trailer.getDataIndexCount());
136 
137     // Meta index.
138     metaBlockIndexReader.readRootIndex(
139         blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
140         trailer.getMetaIndexCount());
141 
142     // File info
143     fileInfo = new FileInfo();
144     fileInfo.read(blockIter.nextBlockWithBlockType(BlockType.FILE_INFO).getByteStream());
145     lastKey = fileInfo.get(FileInfo.LASTKEY);
146     avgKeyLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_KEY_LEN));
147     avgValueLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_VALUE_LEN));
148     byte [] keyValueFormatVersion =
149         fileInfo.get(HFileWriterV2.KEY_VALUE_VERSION);
150     includesMemstoreTS = keyValueFormatVersion != null &&
151         Bytes.toInt(keyValueFormatVersion) ==
152             HFileWriterV2.KEY_VALUE_VER_WITH_MEMSTORE;
153     fsBlockReaderV2.setIncludesMemstoreTS(includesMemstoreTS);
154     if (includesMemstoreTS) {
155       decodeMemstoreTS = Bytes.toLong(fileInfo.get(HFileWriterV2.MAX_MEMSTORE_TS_KEY)) > 0;
156     }
157 
158     // Read data block encoding algorithm name from file info.
159     dataBlockEncoder = HFileDataBlockEncoderImpl.createFromFileInfo(fileInfo,
160         preferredEncodingInCache);
161     fsBlockReaderV2.setDataBlockEncoder(dataBlockEncoder);
162 
163     // Store all other load-on-open blocks for further consumption.
164     HFileBlock b;
165     while ((b = blockIter.nextBlock()) != null) {
166       loadOnOpenBlocks.add(b);
167     }
168   }
169 
170   /**
171    * Create a Scanner on this file. No seeks or reads are done on creation. Call
172    * {@link HFileScanner#seekTo(byte[])} to position an start the read. There is
173    * nothing to clean up in a Scanner. Letting go of your references to the
174    * scanner is sufficient.
175    *
176    * @param cacheBlocks True if we should cache blocks read in by this scanner.
177    * @param pread Use positional read rather than seek+read if true (pread is
178    *          better for random reads, seek+read is better scanning).
179    * @param isCompaction is scanner being used for a compaction?
180    * @return Scanner on this file.
181    */
182    @Override
183    public HFileScanner getScanner(boolean cacheBlocks, final boolean pread,
184       final boolean isCompaction) {
185     // check if we want to use data block encoding in memory
186     if (dataBlockEncoder.useEncodedScanner(isCompaction)) {
187       return new EncodedScannerV2(this, cacheBlocks, pread, isCompaction,
188           includesMemstoreTS);
189     }
190 
191     return new ScannerV2(this, cacheBlocks, pread, isCompaction);
192   }
193 
194   /**
195    * @param metaBlockName
196    * @param cacheBlock Add block to cache, if found
197    * @return block wrapped in a ByteBuffer, with header skipped
198    * @throws IOException
199    */
200   @Override
201   public ByteBuffer getMetaBlock(String metaBlockName, boolean cacheBlock)
202       throws IOException {
203     if (trailer.getMetaIndexCount() == 0) {
204       return null; // there are no meta blocks
205     }
206     if (metaBlockIndexReader == null) {
207       throw new IOException("Meta index not loaded");
208     }
209 
210     byte[] mbname = Bytes.toBytes(metaBlockName);
211     int block = metaBlockIndexReader.rootBlockContainingKey(mbname, 0,
212         mbname.length);
213     if (block == -1)
214       return null;
215     long blockSize = metaBlockIndexReader.getRootBlockDataSize(block);
216     long startTimeNs = System.nanoTime();
217 
218     // Per meta key from any given file, synchronize reads for said block. This
219     // is OK to do for meta blocks because the meta block index is always
220     // single-level.
221     synchronized (metaBlockIndexReader.getRootBlockKey(block)) {
222       // Check cache for block. If found return.
223       long metaBlockOffset = metaBlockIndexReader.getRootBlockOffset(block);
224       BlockCacheKey cacheKey = new BlockCacheKey(name, metaBlockOffset,
225           DataBlockEncoding.NONE, BlockType.META);
226 
227       cacheBlock &= cacheConf.shouldCacheDataOnRead();
228       if (cacheConf.isBlockCacheEnabled()) {
229         HFileBlock cachedBlock =
230           (HFileBlock) cacheConf.getBlockCache().getBlock(cacheKey, cacheBlock, false);
231         if (cachedBlock != null) {
232           // Return a distinct 'shallow copy' of the block,
233           // so pos does not get messed by the scanner
234           return cachedBlock.getBufferWithoutHeader();
235         }
236         // Cache Miss, please load.
237       }
238 
239       HFileBlock metaBlock = fsBlockReader.readBlockData(metaBlockOffset,
240           blockSize, -1, true);
241 
242       final long delta = System.nanoTime() - startTimeNs;
243       HFile.offerReadLatency(delta, true);
244 
245       // Cache the block
246       if (cacheBlock) {
247         cacheConf.getBlockCache().cacheBlock(cacheKey, metaBlock,
248             cacheConf.isInMemory());
249       }
250 
251       return metaBlock.getBufferWithoutHeader();
252     }
253   }
254 
255   /**
256    * Read in a file block.
257    * @param dataBlockOffset offset to read.
258    * @param onDiskBlockSize size of the block
259    * @param cacheBlock
260    * @param pread Use positional read instead of seek+read (positional is
261    *          better doing random reads whereas seek+read is better scanning).
262    * @param isCompaction is this block being read as part of a compaction
263    * @param expectedBlockType the block type we are expecting to read with this
264    *          read operation, or null to read whatever block type is available
265    *          and avoid checking (that might reduce caching efficiency of
266    *          encoded data blocks)
267    * @return Block wrapped in a ByteBuffer.
268    * @throws IOException
269    */
270   @Override
271   public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize,
272       final boolean cacheBlock, boolean pread, final boolean isCompaction,
273       BlockType expectedBlockType)
274       throws IOException {
275     if (dataBlockIndexReader == null) {
276       throw new IOException("Block index not loaded");
277     }
278     if (dataBlockOffset < 0
279         || dataBlockOffset >= trailer.getLoadOnOpenDataOffset()) {
280       throw new IOException("Requested block is out of range: "
281           + dataBlockOffset + ", lastDataBlockOffset: "
282           + trailer.getLastDataBlockOffset());
283     }
284     // For any given block from any given file, synchronize reads for said
285     // block.
286     // Without a cache, this synchronizing is needless overhead, but really
287     // the other choice is to duplicate work (which the cache would prevent you
288     // from doing).
289 
290     BlockCacheKey cacheKey =
291         new BlockCacheKey(name, dataBlockOffset,
292             dataBlockEncoder.getEffectiveEncodingInCache(isCompaction),
293             expectedBlockType);
294 
295     boolean useLock = false;
296     IdLock.Entry lockEntry = null;
297     TraceScope traceScope = Trace.startSpan("HFileReaderV2.readBlock");
298     try {
299       while (true) {
300         if (useLock) {
301           lockEntry = offsetLock.getLockEntry(dataBlockOffset);
302         }
303 
304         // Check cache for block. If found return.
305         if (cacheConf.isBlockCacheEnabled()) {
306           // Try and get the block from the block cache. If the useLock variable is true then this
307           // is the second time through the loop and it should not be counted as a block cache miss.
308           HFileBlock cachedBlock = (HFileBlock) cacheConf.getBlockCache().getBlock(cacheKey,
309               cacheBlock, useLock);
310           if (cachedBlock != null) {
311             if (cachedBlock.getBlockType() == BlockType.DATA) {
312               HFile.dataBlockReadCnt.incrementAndGet();
313             }
314 
315             validateBlockType(cachedBlock, expectedBlockType);
316 
317             // Validate encoding type for encoded blocks. We include encoding
318             // type in the cache key, and we expect it to match on a cache hit.
319             if (cachedBlock.getBlockType() == BlockType.ENCODED_DATA
320                 && cachedBlock.getDataBlockEncoding() != dataBlockEncoder.getEncodingInCache()) {
321               throw new IOException("Cached block under key " + cacheKey + " "
322                   + "has wrong encoding: " + cachedBlock.getDataBlockEncoding() + " (expected: "
323                   + dataBlockEncoder.getEncodingInCache() + ")");
324             }
325             return cachedBlock;
326           }
327           // Carry on, please load.
328         }
329         if (!useLock) {
330           // check cache again with lock
331           useLock = true;
332           continue;
333         }
334         if (Trace.isTracing()) {
335           traceScope.getSpan().addTimelineAnnotation("blockCacheMiss");
336         }
337         // Load block from filesystem.
338         long startTimeNs = System.nanoTime();
339         HFileBlock hfileBlock = fsBlockReader.readBlockData(dataBlockOffset, onDiskBlockSize, -1,
340             pread);
341         hfileBlock = dataBlockEncoder.diskToCacheFormat(hfileBlock, isCompaction);
342         validateBlockType(hfileBlock, expectedBlockType);
343 
344         final long delta = System.nanoTime() - startTimeNs;
345         HFile.offerReadLatency(delta, pread);
346 
347         // Cache the block if necessary
348         if (cacheBlock && cacheConf.shouldCacheBlockOnRead(hfileBlock.getBlockType().getCategory())) {
349           cacheConf.getBlockCache().cacheBlock(cacheKey, hfileBlock, cacheConf.isInMemory());
350         }
351 
352         if (hfileBlock.getBlockType() == BlockType.DATA) {
353           HFile.dataBlockReadCnt.incrementAndGet();
354         }
355 
356         return hfileBlock;
357       }
358     } finally {
359       traceScope.close();
360       if (lockEntry != null) {
361         offsetLock.releaseLockEntry(lockEntry);
362       }
363     }
364   }
365 
366   /**
367    * Compares the actual type of a block retrieved from cache or disk with its
368    * expected type and throws an exception in case of a mismatch. Expected
369    * block type of {@link BlockType#DATA} is considered to match the actual
370    * block type [@link {@link BlockType#ENCODED_DATA} as well.
371    * @param block a block retrieved from cache or disk
372    * @param expectedBlockType the expected block type, or null to skip the
373    *          check
374    */
375   private void validateBlockType(HFileBlock block,
376       BlockType expectedBlockType) throws IOException {
377     if (expectedBlockType == null) {
378       return;
379     }
380     BlockType actualBlockType = block.getBlockType();
381     if (actualBlockType == BlockType.ENCODED_DATA &&
382         expectedBlockType == BlockType.DATA) {
383       // We consider DATA to match ENCODED_DATA for the purpose of this
384       // verification.
385       return;
386     }
387     if (actualBlockType != expectedBlockType) {
388       throw new IOException("Expected block type " + expectedBlockType + ", " +
389           "but got " + actualBlockType + ": " + block);
390     }
391   }
392 
393   /**
394    * @return Last key in the file. May be null if file has no entries. Note that
395    *         this is not the last row key, but rather the byte form of the last
396    *         KeyValue.
397    */
398   @Override
399   public byte[] getLastKey() {
400     return dataBlockIndexReader.isEmpty() ? null : lastKey;
401   }
402 
403   /**
404    * @return Midkey for this file. We work with block boundaries only so
405    *         returned midkey is an approximation only.
406    * @throws IOException
407    */
408   @Override
409   public byte[] midkey() throws IOException {
410     return dataBlockIndexReader.midkey();
411   }
412 
413   @Override
414   public void close() throws IOException {
415     close(cacheConf.shouldEvictOnClose());
416   }
417 
418   public void close(boolean evictOnClose) throws IOException {
419     if (evictOnClose && cacheConf.isBlockCacheEnabled()) {
420       int numEvicted = cacheConf.getBlockCache().evictBlocksByHfileName(name);
421       if (LOG.isTraceEnabled()) {
422         LOG.trace("On close, file=" + name + " evicted=" + numEvicted
423           + " block(s)");
424       }
425     }
426     fsBlockReader.closeStreams();
427   }
428 
429   /** For testing */
430   @Override
431   HFileBlock.FSReader getUncachedBlockReader() {
432     return fsBlockReader;
433   }
434 
435 
436   protected abstract static class AbstractScannerV2
437       extends AbstractHFileReader.Scanner {
438     protected HFileBlock block;
439 
440     /**
441      * The next indexed key is to keep track of the indexed key of the next data block.
442      * If the nextIndexedKey is HConstants.NO_NEXT_INDEXED_KEY, it means that the
443      * current data block is the last data block.
444      *
445      * If the nextIndexedKey is null, it means the nextIndexedKey has not been loaded yet.
446      */
447     protected byte[] nextIndexedKey;
448 
449     public AbstractScannerV2(HFileReaderV2 r, boolean cacheBlocks,
450         final boolean pread, final boolean isCompaction) {
451       super(r, cacheBlocks, pread, isCompaction);
452     }
453 
454     /**
455      * An internal API function. Seek to the given key, optionally rewinding to
456      * the first key of the block before doing the seek.
457      *
458      * @param key key byte array
459      * @param offset key offset in the key byte array
460      * @param length key length
461      * @param rewind whether to rewind to the first key of the block before
462      *        doing the seek. If this is false, we are assuming we never go
463      *        back, otherwise the result is undefined.
464      * @return -1 if the key is earlier than the first key of the file,
465      *         0 if we are at the given key, 1 if we are past the given key
466      *         -2 if the key is earlier than the first key of the file while
467      *         using a faked index key
468      * @throws IOException
469      */
470     protected int seekTo(byte[] key, int offset, int length, boolean rewind)
471         throws IOException {
472       HFileBlockIndex.BlockIndexReader indexReader =
473           reader.getDataBlockIndexReader();
474       BlockWithScanInfo blockWithScanInfo =
475         indexReader.loadDataBlockWithScanInfo(key, offset, length, block,
476             cacheBlocks, pread, isCompaction);
477       if (blockWithScanInfo == null || blockWithScanInfo.getHFileBlock() == null) {
478         // This happens if the key e.g. falls before the beginning of the file.
479         return -1;
480       }
481       return loadBlockAndSeekToKey(blockWithScanInfo.getHFileBlock(),
482           blockWithScanInfo.getNextIndexedKey(), rewind, key, offset, length, false);
483     }
484 
485     protected abstract ByteBuffer getFirstKeyInBlock(HFileBlock curBlock);
486 
487     protected abstract int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
488         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
489         throws IOException;
490 
491     @Override
492     public int seekTo(byte[] key, int offset, int length) throws IOException {
493       // Always rewind to the first key of the block, because the given key
494       // might be before or after the current key.
495       return seekTo(key, offset, length, true);
496     }
497 
498     @Override
499     public int reseekTo(byte[] key, int offset, int length) throws IOException {
500       int compared;
501       if (isSeeked()) {
502         ByteBuffer bb = getKey();
503         compared = reader.getComparator().compare(key, offset,
504             length, bb.array(), bb.arrayOffset(), bb.limit());
505         if (compared < 1) {
506           // If the required key is less than or equal to current key, then
507           // don't do anything.
508           return compared;
509         } else {
510           if (this.nextIndexedKey != null &&
511               (this.nextIndexedKey == HConstants.NO_NEXT_INDEXED_KEY ||
512                reader.getComparator().compare(key, offset, length,
513                    nextIndexedKey, 0, nextIndexedKey.length) < 0)) {
514             // The reader shall continue to scan the current data block instead of querying the
515             // block index as long as it knows the target key is strictly smaller than
516             // the next indexed key or the current data block is the last data block.
517             return loadBlockAndSeekToKey(this.block, this.nextIndexedKey,
518                 false, key, offset, length, false);
519           }
520         }
521       }
522       // Don't rewind on a reseek operation, because reseek implies that we are
523       // always going forward in the file.
524       return seekTo(key, offset, length, false);
525     }
526 
527     @Override
528     public boolean seekBefore(byte[] key, int offset, int length)
529         throws IOException {
530       HFileBlock seekToBlock =
531           reader.getDataBlockIndexReader().seekToDataBlock(key, offset, length,
532               block, cacheBlocks, pread, isCompaction);
533       if (seekToBlock == null) {
534         return false;
535       }
536       ByteBuffer firstKey = getFirstKeyInBlock(seekToBlock);
537 
538       if (reader.getComparator().compare(firstKey.array(),
539           firstKey.arrayOffset(), firstKey.limit(), key, offset, length) == 0)
540       {
541         long previousBlockOffset = seekToBlock.getPrevBlockOffset();
542         // The key we are interested in
543         if (previousBlockOffset == -1) {
544           // we have a 'problem', the key we want is the first of the file.
545           return false;
546         }
547 
548         // It is important that we compute and pass onDiskSize to the block
549         // reader so that it does not have to read the header separately to
550         // figure out the size.
551         seekToBlock = reader.readBlock(previousBlockOffset,
552             seekToBlock.getOffset() - previousBlockOffset, cacheBlocks,
553             pread, isCompaction, BlockType.DATA);
554         // TODO shortcut: seek forward in this block to the last key of the
555         // block.
556       }
557       byte[] firstKeyInCurrentBlock = Bytes.getBytes(firstKey);
558       loadBlockAndSeekToKey(seekToBlock, firstKeyInCurrentBlock, true, key, offset, length, true);
559       return true;
560     }
561 
562 
563     /**
564      * Scans blocks in the "scanned" section of the {@link HFile} until the next
565      * data block is found.
566      *
567      * @return the next block, or null if there are no more data blocks
568      * @throws IOException
569      */
570     protected HFileBlock readNextDataBlock() throws IOException {
571       long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
572       if (block == null)
573         return null;
574 
575       HFileBlock curBlock = block;
576 
577       do {
578         if (curBlock.getOffset() >= lastDataBlockOffset)
579           return null;
580 
581         if (curBlock.getOffset() < 0) {
582           throw new IOException("Invalid block file offset: " + block);
583         }
584 
585         // We are reading the next block without block type validation, because
586         // it might turn out to be a non-data block.
587         curBlock = reader.readBlock(curBlock.getOffset()
588             + curBlock.getOnDiskSizeWithHeader(),
589             curBlock.getNextBlockOnDiskSizeWithHeader(), cacheBlocks, pread,
590             isCompaction, null);
591       } while (!(curBlock.getBlockType().equals(BlockType.DATA) ||
592           curBlock.getBlockType().equals(BlockType.ENCODED_DATA)));
593 
594       return curBlock;
595     }
596   }
597 
598   /**
599    * Implementation of {@link HFileScanner} interface.
600    */
601   protected static class ScannerV2 extends AbstractScannerV2 {
602     private HFileReaderV2 reader;
603 
604     public ScannerV2(HFileReaderV2 r, boolean cacheBlocks,
605         final boolean pread, final boolean isCompaction) {
606       super(r, cacheBlocks, pread, isCompaction);
607       this.reader = r;
608     }
609 
610     @Override
611     public KeyValue getKeyValue() {
612       if (!isSeeked())
613         return null;
614 
615       KeyValue ret = new KeyValue(blockBuffer.array(),
616           blockBuffer.arrayOffset() + blockBuffer.position(),
617           KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen,
618           currKeyLen);
619       if (this.reader.shouldIncludeMemstoreTS()) {
620         ret.setMvccVersion(currMemstoreTS);
621       }
622       return ret;
623     }
624 
625     @Override
626     public ByteBuffer getKey() {
627       assertSeeked();
628       return ByteBuffer.wrap(
629           blockBuffer.array(),
630           blockBuffer.arrayOffset() + blockBuffer.position()
631               + KEY_VALUE_LEN_SIZE, currKeyLen).slice();
632     }
633 
634     @Override
635     public ByteBuffer getValue() {
636       assertSeeked();
637       return ByteBuffer.wrap(
638           blockBuffer.array(),
639           blockBuffer.arrayOffset() + blockBuffer.position()
640               + KEY_VALUE_LEN_SIZE + currKeyLen, currValueLen).slice();
641     }
642 
643     private void setNonSeekedState() {
644       block = null;
645       blockBuffer = null;
646       currKeyLen = 0;
647       currValueLen = 0;
648       currMemstoreTS = 0;
649       currMemstoreTSLen = 0;
650     }
651 
652     /**
653      * Go to the next key/value in the block section. Loads the next block if
654      * necessary. If successful, {@link #getKey()} and {@link #getValue()} can
655      * be called.
656      *
657      * @return true if successfully navigated to the next key/value
658      */
659     @Override
660     public boolean next() throws IOException {
661       assertSeeked();
662 
663       try {
664         blockBuffer.position(blockBuffer.position() + KEY_VALUE_LEN_SIZE
665             + currKeyLen + currValueLen + currMemstoreTSLen);
666       } catch (IllegalArgumentException e) {
667         LOG.error("Current pos = " + blockBuffer.position()
668             + "; currKeyLen = " + currKeyLen + "; currValLen = "
669             + currValueLen + "; block limit = " + blockBuffer.limit()
670             + "; HFile name = " + reader.getName()
671             + "; currBlock currBlockOffset = " + block.getOffset());
672         throw e;
673       }
674 
675       if (blockBuffer.remaining() <= 0) {
676         long lastDataBlockOffset =
677             reader.getTrailer().getLastDataBlockOffset();
678 
679         if (block.getOffset() >= lastDataBlockOffset) {
680           setNonSeekedState();
681           return false;
682         }
683 
684         // read the next block
685         HFileBlock nextBlock = readNextDataBlock();
686         if (nextBlock == null) {
687           setNonSeekedState();
688           return false;
689         }
690 
691         updateCurrBlock(nextBlock);
692         return true;
693       }
694 
695       // We are still in the same block.
696       readKeyValueLen();
697       return true;
698     }
699 
700     /**
701      * Positions this scanner at the start of the file.
702      *
703      * @return false if empty file; i.e. a call to next would return false and
704      *         the current key and value are undefined.
705      * @throws IOException
706      */
707     @Override
708     public boolean seekTo() throws IOException {
709       if (reader == null) {
710         return false;
711       }
712 
713       if (reader.getTrailer().getEntryCount() == 0) {
714         // No data blocks.
715         return false;
716       }
717 
718       long firstDataBlockOffset =
719           reader.getTrailer().getFirstDataBlockOffset();
720       if (block != null && block.getOffset() == firstDataBlockOffset) {
721         blockBuffer.rewind();
722         readKeyValueLen();
723         return true;
724       }
725 
726       block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
727           isCompaction, BlockType.DATA);
728       if (block.getOffset() < 0) {
729         throw new IOException("Invalid block offset: " + block.getOffset());
730       }
731       updateCurrBlock(block);
732       return true;
733     }
734 
735     @Override
736     protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
737         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
738         throws IOException {
739       if (block == null || block.getOffset() != seekToBlock.getOffset()) {
740         updateCurrBlock(seekToBlock);
741       } else if (rewind) {
742         blockBuffer.rewind();
743       }
744 
745       // Update the nextIndexedKey
746       this.nextIndexedKey = nextIndexedKey;
747       return blockSeek(key, offset, length, seekBefore);
748     }
749 
750     /**
751      * Updates the current block to be the given {@link HFileBlock}. Seeks to
752      * the the first key/value pair.
753      *
754      * @param newBlock the block to make current
755      */
756     private void updateCurrBlock(HFileBlock newBlock) {
757       block = newBlock;
758 
759       // sanity check
760       if (block.getBlockType() != BlockType.DATA) {
761         throw new IllegalStateException("ScannerV2 works only on data " +
762             "blocks, got " + block.getBlockType() + "; " +
763             "fileName=" + reader.name + ", " +
764             "dataBlockEncoder=" + reader.dataBlockEncoder + ", " +
765             "isCompaction=" + isCompaction);
766       }
767 
768       blockBuffer = block.getBufferWithoutHeader();
769       readKeyValueLen();
770       blockFetches++;
771 
772       // Reset the next indexed key
773       this.nextIndexedKey = null;
774     }
775 
776     private final void readKeyValueLen() {
777       blockBuffer.mark();
778       currKeyLen = blockBuffer.getInt();
779       currValueLen = blockBuffer.getInt();
780       blockBuffer.reset();
781       if (this.reader.shouldIncludeMemstoreTS()) {
782         if (this.reader.decodeMemstoreTS) {
783           try {
784             int memstoreTSOffset = blockBuffer.arrayOffset()
785                 + blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen
786                 + currValueLen;
787             currMemstoreTS = Bytes.readVLong(blockBuffer.array(),
788                 memstoreTSOffset);
789             currMemstoreTSLen = WritableUtils.getVIntSize(currMemstoreTS);
790           } catch (Exception e) {
791             throw new RuntimeException("Error reading memstore timestamp", e);
792           }
793         } else {
794           currMemstoreTS = 0;
795           currMemstoreTSLen = 1;
796         }
797       }
798 
799       if (currKeyLen < 0 || currValueLen < 0
800           || currKeyLen > blockBuffer.limit()
801           || currValueLen > blockBuffer.limit()) {
802         throw new IllegalStateException("Invalid currKeyLen " + currKeyLen
803             + " or currValueLen " + currValueLen + ". Block offset: "
804             + block.getOffset() + ", block length: " + blockBuffer.limit()
805             + ", position: " + blockBuffer.position() + " (without header).");
806       }
807     }
808 
809     /**
810      * Within a loaded block, seek looking for the last key that is smaller
811      * than (or equal to?) the key we are interested in.
812      *
813      * A note on the seekBefore: if you have seekBefore = true, AND the first
814      * key in the block = key, then you'll get thrown exceptions. The caller has
815      * to check for that case and load the previous block as appropriate.
816      *
817      * @param key the key to find
818      * @param seekBefore find the key before the given key in case of exact
819      *          match.
820      * @return 0 in case of an exact key match, 1 in case of an inexact match,
821      *         -2 in case of an inexact match and furthermore, the input key less
822      *         than the first key of current block(e.g. using a faked index key)
823      */
824     private int blockSeek(byte[] key, int offset, int length,
825         boolean seekBefore) {
826       int klen, vlen;
827       long memstoreTS = 0;
828       int memstoreTSLen = 0;
829       int lastKeyValueSize = -1;
830       do {
831         blockBuffer.mark();
832         klen = blockBuffer.getInt();
833         vlen = blockBuffer.getInt();
834         blockBuffer.reset();
835         if (this.reader.shouldIncludeMemstoreTS()) {
836           if (this.reader.decodeMemstoreTS) {
837             try {
838               int memstoreTSOffset = blockBuffer.arrayOffset()
839                   + blockBuffer.position() + KEY_VALUE_LEN_SIZE + klen + vlen;
840               memstoreTS = Bytes.readVLong(blockBuffer.array(),
841                   memstoreTSOffset);
842               memstoreTSLen = WritableUtils.getVIntSize(memstoreTS);
843             } catch (Exception e) {
844               throw new RuntimeException("Error reading memstore timestamp", e);
845             }
846           } else {
847             memstoreTS = 0;
848             memstoreTSLen = 1;
849           }
850         }
851 
852         int keyOffset = blockBuffer.arrayOffset() + blockBuffer.position()
853             + KEY_VALUE_LEN_SIZE;
854         int comp = reader.getComparator().compare(key, offset, length,
855             blockBuffer.array(), keyOffset, klen);
856 
857         if (comp == 0) {
858           if (seekBefore) {
859             if (lastKeyValueSize < 0) {
860               throw new IllegalStateException("blockSeek with seekBefore "
861                   + "at the first key of the block: key="
862                   + Bytes.toStringBinary(key) + ", blockOffset="
863                   + block.getOffset() + ", onDiskSize="
864                   + block.getOnDiskSizeWithHeader());
865             }
866             blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
867             readKeyValueLen();
868             return 1; // non exact match.
869           }
870           currKeyLen = klen;
871           currValueLen = vlen;
872           if (this.reader.shouldIncludeMemstoreTS()) {
873             currMemstoreTS = memstoreTS;
874             currMemstoreTSLen = memstoreTSLen;
875           }
876           return 0; // indicate exact match
877         } else if (comp < 0) {
878           if (lastKeyValueSize > 0)
879             blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
880           readKeyValueLen();
881           if (lastKeyValueSize == -1 && blockBuffer.position() == 0
882               && this.reader.trailer.getMinorVersion() >= MINOR_VERSION_WITH_FAKED_KEY) {
883             return HConstants.INDEX_KEY_MAGIC;
884           }
885           return 1;
886         }
887 
888         // The size of this key/value tuple, including key/value length fields.
889         lastKeyValueSize = klen + vlen + memstoreTSLen + KEY_VALUE_LEN_SIZE;
890         blockBuffer.position(blockBuffer.position() + lastKeyValueSize);
891       } while (blockBuffer.remaining() > 0);
892 
893       // Seek to the last key we successfully read. This will happen if this is
894       // the last key/value pair in the file, in which case the following call
895       // to next() has to return false.
896       blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
897       readKeyValueLen();
898       return 1; // didn't exactly find it.
899     }
900 
901     @Override
902     protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
903       ByteBuffer buffer = curBlock.getBufferWithoutHeader();
904       // It is safe to manipulate this buffer because we own the buffer object.
905       buffer.rewind();
906       int klen = buffer.getInt();
907       buffer.getInt();
908       ByteBuffer keyBuff = buffer.slice();
909       keyBuff.limit(klen);
910       keyBuff.rewind();
911       return keyBuff;
912     }
913 
914     @Override
915     public String getKeyString() {
916       return Bytes.toStringBinary(blockBuffer.array(),
917           blockBuffer.arrayOffset() + blockBuffer.position()
918               + KEY_VALUE_LEN_SIZE, currKeyLen);
919     }
920 
921     @Override
922     public String getValueString() {
923       return Bytes.toString(blockBuffer.array(), blockBuffer.arrayOffset()
924           + blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen,
925           currValueLen);
926     }
927   }
928 
929   /**
930    * ScannerV2 that operates on encoded data blocks.
931    */
932   protected static class EncodedScannerV2 extends AbstractScannerV2 {
933     private DataBlockEncoder.EncodedSeeker seeker = null;
934     private DataBlockEncoder dataBlockEncoder = null;
935     private final boolean includesMemstoreTS;
936 
937     public EncodedScannerV2(HFileReaderV2 reader, boolean cacheBlocks,
938         boolean pread, boolean isCompaction, boolean includesMemstoreTS) {
939       super(reader, cacheBlocks, pread, isCompaction);
940       this.includesMemstoreTS = includesMemstoreTS;
941     }
942 
943     private void setDataBlockEncoder(DataBlockEncoder dataBlockEncoder) {
944       this.dataBlockEncoder = dataBlockEncoder;
945       seeker = dataBlockEncoder.createSeeker(reader.getComparator(),
946           includesMemstoreTS);
947     }
948 
949     /**
950      * Updates the current block to be the given {@link HFileBlock}. Seeks to
951      * the the first key/value pair.
952      *
953      * @param newBlock the block to make current
954      */
955     private void updateCurrentBlock(HFileBlock newBlock) {
956       block = newBlock;
957 
958       // sanity checks
959       if (block.getBlockType() != BlockType.ENCODED_DATA) {
960         throw new IllegalStateException(
961             "EncodedScannerV2 works only on encoded data blocks");
962       }
963 
964       short dataBlockEncoderId = block.getDataBlockEncodingId();
965       if (dataBlockEncoder == null ||
966           !DataBlockEncoding.isCorrectEncoder(dataBlockEncoder,
967               dataBlockEncoderId)) {
968         DataBlockEncoder encoder =
969             DataBlockEncoding.getDataBlockEncoderById(dataBlockEncoderId);
970         setDataBlockEncoder(encoder);
971       }
972 
973       seeker.setCurrentBuffer(getEncodedBuffer(newBlock));
974       blockFetches++;
975     }
976 
977     private ByteBuffer getEncodedBuffer(HFileBlock newBlock) {
978       ByteBuffer origBlock = newBlock.getBufferReadOnly();
979       ByteBuffer encodedBlock = ByteBuffer.wrap(origBlock.array(),
980           origBlock.arrayOffset() + newBlock.headerSize() +
981           DataBlockEncoding.ID_SIZE,
982           newBlock.getUncompressedSizeWithoutHeader() -
983           DataBlockEncoding.ID_SIZE).slice();
984       return encodedBlock;
985     }
986 
987     @Override
988     public boolean seekTo() throws IOException {
989       if (reader == null) {
990         return false;
991       }
992 
993       if (reader.getTrailer().getEntryCount() == 0) {
994         // No data blocks.
995         return false;
996       }
997 
998       long firstDataBlockOffset =
999           reader.getTrailer().getFirstDataBlockOffset();
1000       if (block != null && block.getOffset() == firstDataBlockOffset) {
1001         seeker.rewind();
1002         return true;
1003       }
1004 
1005       block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
1006           isCompaction, BlockType.DATA);
1007       if (block.getOffset() < 0) {
1008         throw new IOException("Invalid block offset: " + block.getOffset());
1009       }
1010       updateCurrentBlock(block);
1011       return true;
1012     }
1013 
1014     @Override
1015     public boolean next() throws IOException {
1016       boolean isValid = seeker.next();
1017       if (!isValid) {
1018         block = readNextDataBlock();
1019         isValid = block != null;
1020         if (isValid) {
1021           updateCurrentBlock(block);
1022         }
1023       }
1024       return isValid;
1025     }
1026 
1027     @Override
1028     public ByteBuffer getKey() {
1029       assertValidSeek();
1030       return seeker.getKeyDeepCopy();
1031     }
1032 
1033     @Override
1034     public ByteBuffer getValue() {
1035       assertValidSeek();
1036       return seeker.getValueShallowCopy();
1037     }
1038 
1039     @Override
1040     public KeyValue getKeyValue() {
1041       if (block == null) {
1042         return null;
1043       }
1044       return seeker.getKeyValue();
1045     }
1046 
1047     @Override
1048     public String getKeyString() {
1049       ByteBuffer keyBuffer = getKey();
1050       return Bytes.toStringBinary(keyBuffer.array(),
1051           keyBuffer.arrayOffset(), keyBuffer.limit());
1052     }
1053 
1054     @Override
1055     public String getValueString() {
1056       ByteBuffer valueBuffer = getValue();
1057       return Bytes.toStringBinary(valueBuffer.array(),
1058           valueBuffer.arrayOffset(), valueBuffer.limit());
1059     }
1060 
1061     private void assertValidSeek() {
1062       if (block == null) {
1063         throw new NotSeekedException();
1064       }
1065     }
1066 
1067     @Override
1068     protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
1069       return dataBlockEncoder.getFirstKeyInBlock(getEncodedBuffer(curBlock));
1070     }
1071 
1072     @Override
1073     protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
1074         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
1075         throws IOException  {
1076       if (block == null || block.getOffset() != seekToBlock.getOffset()) {
1077         updateCurrentBlock(seekToBlock);
1078       } else if (rewind) {
1079         seeker.rewind();
1080       }
1081       this.nextIndexedKey = nextIndexedKey;
1082       return seeker.seekToKeyInBlock(key, offset, length, seekBefore);
1083     }
1084   }
1085 
1086   /**
1087    * Returns a buffer with the Bloom filter metadata. The caller takes
1088    * ownership of the buffer.
1089    */
1090   @Override
1091   public DataInput getGeneralBloomFilterMetadata() throws IOException {
1092     return this.getBloomFilterMetadata(BlockType.GENERAL_BLOOM_META);
1093   }
1094 
1095   @Override
1096   public DataInput getDeleteBloomFilterMetadata() throws IOException {
1097     return this.getBloomFilterMetadata(BlockType.DELETE_FAMILY_BLOOM_META);
1098   }
1099 
1100   private DataInput getBloomFilterMetadata(BlockType blockType)
1101   throws IOException {
1102     if (blockType != BlockType.GENERAL_BLOOM_META &&
1103         blockType != BlockType.DELETE_FAMILY_BLOOM_META) {
1104       throw new RuntimeException("Block Type: " + blockType.toString() +
1105           " is not supported") ;
1106     }
1107 
1108     for (HFileBlock b : loadOnOpenBlocks)
1109       if (b.getBlockType() == blockType)
1110         return b.getByteStream();
1111     return null;
1112   }
1113 
1114   @Override
1115   public boolean isFileInfoLoaded() {
1116     return true; // We load file info in constructor in version 2.
1117   }
1118 
1119   /**
1120    * Validates that the minor version is within acceptable limits.
1121    * Otherwise throws an Runtime exception
1122    */
1123   private void validateMinorVersion(Path path, int minorVersion) {
1124     if (minorVersion < MIN_MINOR_VERSION ||
1125         minorVersion > MAX_MINOR_VERSION) {
1126       String msg = "Minor version for path " + path + 
1127                    " is expected to be between " +
1128                    MIN_MINOR_VERSION + " and " + MAX_MINOR_VERSION +
1129                    " but is found to be " + minorVersion;
1130       LOG.error(msg);
1131       throw new RuntimeException(msg);
1132     }
1133   }
1134 }