/*
 * Copyright 2011 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.io.hfile;

import java.io.DataInput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.fs.HFileSystem;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoder;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.hfile.BlockType.BlockCategory;
import org.apache.hadoop.hbase.io.hfile.HFile.FileInfo;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.IdLock;
import org.apache.hadoop.io.WritableUtils;

/**
 * {@link HFile} reader for version 2.
 */
public class HFileReaderV2 extends AbstractHFileReader {

  private static final Log LOG = LogFactory.getLog(HFileReaderV2.class);

  /**
   * The size of a (key length, value length) tuple that prefixes each entry in
   * a data block.
   */
  private static final int KEY_VALUE_LEN_SIZE = 2 * Bytes.SIZEOF_INT;
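
  // On-disk layout of each entry in a non-encoded data block, as read back by
  // ScannerV2#readKeyValueLen() and ScannerV2#blockSeek() below:
  //
  //   [key length: int][value length: int][key bytes][value bytes]
  //   [memstore timestamp: vlong]   <-- present only when includesMemstoreTS
  //
  // KEY_VALUE_LEN_SIZE covers the two leading int fields.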

  private boolean includesMemstoreTS = false;

  private boolean shouldIncludeMemstoreTS() {
    return includesMemstoreTS;
  }

  /**
   * A "sparse lock" implementation that allows locking on a particular block,
   * identified by its offset. The purpose is to prevent two clients from
   * loading the same block concurrently: all but one client wait for the
   * winning client to put the block into the cache.
   */
  private IdLock offsetLock = new IdLock();
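
  // A minimal sketch of the locking pattern used in readBlock() below: check
  // the cache first, then take the per-offset lock and re-check the cache
  // before going to the filesystem, so concurrent readers of the same block
  // never duplicate the disk read.
  //
  //   IdLock.Entry lockEntry = offsetLock.getLockEntry(offset);
  //   try {
  //     // re-check the block cache; on a miss, read the block from disk
  //     // and insert it into the cache
  //   } finally {
  //     offsetLock.releaseLockEntry(lockEntry);
  //   }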

  /**
   * Blocks read from the load-on-open section, excluding the root data index,
   * the meta index, and the file info.
   */
  private List<HFileBlock> loadOnOpenBlocks = new ArrayList<HFileBlock>();

  /** Minimum minor version supported by this HFile format */
  static final int MIN_MINOR_VERSION = 0;

  /** Maximum minor version supported by this HFile format */
  static final int MAX_MINOR_VERSION = 1;

  /**
   * Opens an HFile. You must load the index before you can use it by calling
   * {@link #loadFileInfo()}.
   *
   * @param path Path to HFile.
   * @param trailer File trailer.
   * @param fsdis input stream. Caller is responsible for closing the passed
   *          stream.
   * @param fsdisNoFsChecksum input stream that does not verify filesystem
   *          checksums.
   * @param size Length of the stream.
   * @param closeIStream Whether to close the stream.
   * @param cacheConf Cache configuration.
   * @param preferredEncodingInCache the encoding to use in cache in case we
   *          have a choice. If the file is already encoded on disk, we will
   *          still use its on-disk encoding in cache.
   * @param hfs The file system.
   */
  public HFileReaderV2(Path path, FixedFileTrailer trailer,
      final FSDataInputStream fsdis, final FSDataInputStream fsdisNoFsChecksum,
      final long size,
      final boolean closeIStream, final CacheConfig cacheConf,
      DataBlockEncoding preferredEncodingInCache, final HFileSystem hfs)
      throws IOException {
    super(path, trailer, fsdis, fsdisNoFsChecksum, size,
        closeIStream, cacheConf, hfs);
    trailer.expectMajorVersion(2);
    validateMinorVersion(path, trailer.getMinorVersion());
    HFileBlock.FSReaderV2 fsBlockReaderV2 = new HFileBlock.FSReaderV2(fsdis,
        fsdisNoFsChecksum,
        compressAlgo, fileSize, trailer.getMinorVersion(), hfs, path);
    this.fsBlockReader = fsBlockReaderV2; // upcast

    // Comparator class name is stored in the trailer in version 2.
    comparator = trailer.createComparator();
    dataBlockIndexReader = new HFileBlockIndex.BlockIndexReader(comparator,
        trailer.getNumDataIndexLevels(), this);
    metaBlockIndexReader = new HFileBlockIndex.BlockIndexReader(
        Bytes.BYTES_RAWCOMPARATOR, 1);

    // Parse load-on-open data.

    HFileBlock.BlockIterator blockIter = fsBlockReaderV2.blockRange(
        trailer.getLoadOnOpenDataOffset(),
        fileSize - trailer.getTrailerSize());

    // Data index. We also read statistics about the block index written after
    // the root level.
    dataBlockIndexReader.readMultiLevelIndexRoot(
        blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
        trailer.getDataIndexCount());

    // Meta index.
    metaBlockIndexReader.readRootIndex(
        blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
        trailer.getMetaIndexCount());

    // File info
    fileInfo = new FileInfo();
    fileInfo.readFields(
        blockIter.nextBlockWithBlockType(BlockType.FILE_INFO).getByteStream());
    lastKey = fileInfo.get(FileInfo.LASTKEY);
    avgKeyLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_KEY_LEN));
    avgValueLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_VALUE_LEN));
    byte[] keyValueFormatVersion =
        fileInfo.get(HFileWriterV2.KEY_VALUE_VERSION);
    includesMemstoreTS = keyValueFormatVersion != null &&
        Bytes.toInt(keyValueFormatVersion) ==
            HFileWriterV2.KEY_VALUE_VER_WITH_MEMSTORE;
    fsBlockReaderV2.setIncludesMemstoreTS(includesMemstoreTS);

    // Read data block encoding algorithm name from file info.
    dataBlockEncoder = HFileDataBlockEncoderImpl.createFromFileInfo(fileInfo,
        preferredEncodingInCache);
    fsBlockReaderV2.setDataBlockEncoder(dataBlockEncoder);

    // Store all other load-on-open blocks for further consumption.
    HFileBlock b;
    while ((b = blockIter.nextBlock()) != null) {
      loadOnOpenBlocks.add(b);
    }
  }
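
  // A hedged sketch of how this reader might be opened by hand; real callers
  // normally go through HFile.createReader(...). The "fs", "path" and "conf"
  // names are hypothetical, and passing the same stream for both checksum
  // variants (and null for the HFileSystem) is a simplification.
  //
  //   FSDataInputStream fsdis = fs.open(path);
  //   long size = fs.getFileStatus(path).getLen();
  //   FixedFileTrailer trailer = FixedFileTrailer.readFromStream(fsdis, size);
  //   HFileReaderV2 reader = new HFileReaderV2(path, trailer, fsdis, fsdis,
  //       size, true, new CacheConfig(conf), DataBlockEncoding.NONE, null);
  //   reader.loadFileInfo();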

  /**
   * Create a Scanner on this file. No seeks or reads are done on creation.
   * Call {@link HFileScanner#seekTo(byte[])} to position and start the read.
   * There is nothing to clean up in a Scanner. Letting go of your references
   * to the scanner is sufficient.
   *
   * @param cacheBlocks True if we should cache blocks read in by this scanner.
   * @param pread Use positional read rather than seek+read if true (pread is
   *          better for random reads, seek+read is better for scanning).
   * @param isCompaction is scanner being used for a compaction?
   * @return Scanner on this file.
   */
  @Override
  public HFileScanner getScanner(boolean cacheBlocks, final boolean pread,
      final boolean isCompaction) {
    // Check if we want to use data block encoding in memory.
    if (dataBlockEncoder.useEncodedScanner(isCompaction)) {
      return new EncodedScannerV2(this, cacheBlocks, pread, isCompaction,
          includesMemstoreTS);
    }

    return new ScannerV2(this, cacheBlocks, pread, isCompaction);
  }
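
  // A minimal usage sketch (the "reader" variable is hypothetical): scan
  // every key/value in the file from the first data block onward.
  //
  //   HFileScanner scanner = reader.getScanner(true, false, false);
  //   if (scanner.seekTo()) {
  //     do {
  //       KeyValue kv = scanner.getKeyValue();
  //       // ... process kv ...
  //     } while (scanner.next());
  //   }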

  /**
   * @param metaBlockName name of the meta block to retrieve
   * @param cacheBlock Add block to cache, if found
   * @return block wrapped in a ByteBuffer, with header skipped
   * @throws IOException
   */
  @Override
  public ByteBuffer getMetaBlock(String metaBlockName, boolean cacheBlock)
      throws IOException {
    if (trailer.getMetaIndexCount() == 0) {
      return null; // there are no meta blocks
    }
    if (metaBlockIndexReader == null) {
      throw new IOException("Meta index not loaded");
    }

    byte[] mbname = Bytes.toBytes(metaBlockName);
    int block = metaBlockIndexReader.rootBlockContainingKey(mbname, 0,
        mbname.length);
    if (block == -1) {
      return null;
    }
    long blockSize = metaBlockIndexReader.getRootBlockDataSize(block);
    long startTimeNs = System.nanoTime();

    // Per meta key from any given file, synchronize reads for said block. This
    // is OK to do for meta blocks because the meta block index is always
    // single-level.
    synchronized (metaBlockIndexReader.getRootBlockKey(block)) {
      // Check cache for block. If found, return it.
      long metaBlockOffset = metaBlockIndexReader.getRootBlockOffset(block);
      BlockCacheKey cacheKey = new BlockCacheKey(name, metaBlockOffset,
          DataBlockEncoding.NONE, BlockType.META);

      cacheBlock &= cacheConf.shouldCacheDataOnRead();
      if (cacheConf.isBlockCacheEnabled()) {
        HFileBlock cachedBlock =
          (HFileBlock) cacheConf.getBlockCache().getBlock(cacheKey, cacheBlock, false);
        if (cachedBlock != null) {
          // Return a distinct 'shallow copy' of the block, so the buffer
          // position does not get modified by the scanner.
          getSchemaMetrics().updateOnCacheHit(BlockCategory.META, false);
          return cachedBlock.getBufferWithoutHeader();
        }
        // Cache miss; load the block from disk.
      }

      HFileBlock metaBlock = fsBlockReader.readBlockData(metaBlockOffset,
          blockSize, -1, true);
      passSchemaMetricsTo(metaBlock);

      final long delta = System.nanoTime() - startTimeNs;
      HFile.offerReadLatency(delta, true);
      getSchemaMetrics().updateOnCacheMiss(BlockCategory.META, false, delta);

      // Cache the block
      if (cacheBlock) {
        cacheConf.getBlockCache().cacheBlock(cacheKey, metaBlock,
            cacheConf.isInMemory());
      }

      return metaBlock.getBufferWithoutHeader();
    }
  }
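
  // Illustrative call (the meta block name is hypothetical; it must match a
  // name passed to HFile.Writer#appendMetaBlock when the file was written):
  //
  //   ByteBuffer meta = reader.getMetaBlock("MY_META", false);
  //   if (meta != null) {
  //     // ... decode the meta block ...
  //   }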

  /**
   * Read in a file block.
   * @param dataBlockOffset offset to read.
   * @param onDiskBlockSize size of the block
   * @param cacheBlock whether to cache the block that is read
   * @param pread Use positional read instead of seek+read (positional read is
   *          better for random reads whereas seek+read is better for
   *          scanning).
   * @param isCompaction is this block being read as part of a compaction
   * @param expectedBlockType the block type we are expecting to read with this
   *          read operation, or null to read whatever block type is available
   *          and avoid checking (that might reduce caching efficiency of
   *          encoded data blocks)
   * @return Block wrapped in a ByteBuffer.
   * @throws IOException
   */
  @Override
  public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize,
      final boolean cacheBlock, boolean pread, final boolean isCompaction,
      BlockType expectedBlockType)
      throws IOException {
    if (dataBlockIndexReader == null) {
      throw new IOException("Block index not loaded");
    }
    if (dataBlockOffset < 0
        || dataBlockOffset >= trailer.getLoadOnOpenDataOffset()) {
      throw new IOException("Requested block is out of range: "
          + dataBlockOffset + ", loadOnOpenDataOffset: "
          + trailer.getLoadOnOpenDataOffset());
    }
    // For any given block from any given file, synchronize reads for said
    // block. Without a cache, this synchronizing is needless overhead, but
    // really the other choice is to duplicate work (which the cache would
    // prevent you from doing).

    BlockCacheKey cacheKey =
        new BlockCacheKey(name, dataBlockOffset,
            dataBlockEncoder.getEffectiveEncodingInCache(isCompaction),
            expectedBlockType);

    boolean useLock = false;
    IdLock.Entry lockEntry = null;

    try {
      while (true) {

        if (useLock) {
          lockEntry = offsetLock.getLockEntry(dataBlockOffset);
        }

        // Check cache for block. If found, return it.
        if (cacheConf.isBlockCacheEnabled()) {
          // Try and get the block from the block cache. If useLock is true
          // then this is the second time through the loop and it should not
          // be counted as a block cache miss.
          HFileBlock cachedBlock = (HFileBlock)
              cacheConf.getBlockCache().getBlock(cacheKey, cacheBlock, useLock);
          if (cachedBlock != null) {
            BlockCategory blockCategory =
                cachedBlock.getBlockType().getCategory();

            getSchemaMetrics().updateOnCacheHit(blockCategory, isCompaction);

            if (cachedBlock.getBlockType() == BlockType.DATA) {
              HFile.dataBlockReadCnt.incrementAndGet();
            }

            validateBlockType(cachedBlock, expectedBlockType);

            // Validate encoding type for encoded blocks. We include encoding
            // type in the cache key, and we expect it to match on a cache hit.
            if (cachedBlock.getBlockType() == BlockType.ENCODED_DATA &&
                cachedBlock.getDataBlockEncoding() !=
                    dataBlockEncoder.getEncodingInCache()) {
              throw new IOException("Cached block under key " + cacheKey + " " +
                  "has wrong encoding: " + cachedBlock.getDataBlockEncoding() +
                  " (expected: " + dataBlockEncoder.getEncodingInCache() + ")");
            }
            return cachedBlock;
          }
          // Carry on, please load.
        }
        if (!useLock) {
          // Check the cache again, this time while holding the lock.
          useLock = true;
          continue;
        }

        // Load block from filesystem.
        long startTimeNs = System.nanoTime();
        HFileBlock hfileBlock = fsBlockReader.readBlockData(dataBlockOffset,
            onDiskBlockSize, -1, pread);
        hfileBlock = dataBlockEncoder.diskToCacheFormat(hfileBlock,
            isCompaction);
        validateBlockType(hfileBlock, expectedBlockType);
        passSchemaMetricsTo(hfileBlock);
        BlockCategory blockCategory = hfileBlock.getBlockType().getCategory();

        final long delta = System.nanoTime() - startTimeNs;
        HFile.offerReadLatency(delta, pread);
        getSchemaMetrics().updateOnCacheMiss(blockCategory, isCompaction, delta);

        // Cache the block if necessary
        if (cacheBlock && cacheConf.shouldCacheBlockOnRead(
            hfileBlock.getBlockType().getCategory())) {
          cacheConf.getBlockCache().cacheBlock(cacheKey, hfileBlock,
              cacheConf.isInMemory());
        }

        if (hfileBlock.getBlockType() == BlockType.DATA) {
          HFile.dataBlockReadCnt.incrementAndGet();
        }

        return hfileBlock;
      }
    } finally {
      if (lockEntry != null) {
        offsetLock.releaseLockEntry(lockEntry);
      }
    }
  }
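
  // Illustrative call (hypothetical "reader" variable): read the file's first
  // data block, letting the block reader figure out the on-disk size (-1).
  // This mirrors what ScannerV2#seekTo() does below.
  //
  //   HFileBlock first = reader.readBlock(
  //       reader.getTrailer().getFirstDataBlockOffset(), -1,
  //       true,   // cacheBlock
  //       false,  // pread
  //       false,  // isCompaction
  //       BlockType.DATA);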

  /**
   * Compares the actual type of a block retrieved from cache or disk with its
   * expected type and throws an exception in case of a mismatch. An expected
   * block type of {@link BlockType#DATA} is considered to match the actual
   * block type {@link BlockType#ENCODED_DATA} as well.
   * @param block a block retrieved from cache or disk
   * @param expectedBlockType the expected block type, or null to skip the
   *          check
   */
  private void validateBlockType(HFileBlock block,
      BlockType expectedBlockType) throws IOException {
    if (expectedBlockType == null) {
      return;
    }
    BlockType actualBlockType = block.getBlockType();
    if (actualBlockType == BlockType.ENCODED_DATA &&
        expectedBlockType == BlockType.DATA) {
      // We consider DATA to match ENCODED_DATA for the purpose of this
      // verification.
      return;
    }
    if (actualBlockType != expectedBlockType) {
      throw new IOException("Expected block type " + expectedBlockType + ", " +
          "but got " + actualBlockType + ": " + block);
    }
  }

  /**
   * @return Last key in the file. May be null if the file has no entries. Note
   *         that this is not the last row key, but rather the byte form of the
   *         last KeyValue.
   */
  @Override
  public byte[] getLastKey() {
    return dataBlockIndexReader.isEmpty() ? null : lastKey;
  }

  /**
   * @return Midkey for this file. We work with block boundaries only, so the
   *         returned midkey is only an approximation.
   * @throws IOException
   */
  @Override
  public byte[] midkey() throws IOException {
    return dataBlockIndexReader.midkey();
  }

  @Override
  public void close() throws IOException {
    close(cacheConf.shouldEvictOnClose());
  }

  public void close(boolean evictOnClose) throws IOException {
    if (evictOnClose && cacheConf.isBlockCacheEnabled()) {
      int numEvicted = cacheConf.getBlockCache().evictBlocksByHfileName(name);
      if (LOG.isTraceEnabled()) {
        LOG.trace("On close, file=" + name + " evicted=" + numEvicted
          + " block(s)");
      }
    }
    if (closeIStream) {
      if (istream != istreamNoFsChecksum && istreamNoFsChecksum != null) {
        istreamNoFsChecksum.close();
        istreamNoFsChecksum = null;
      }
      if (istream != null) {
        istream.close();
        istream = null;
      }
    }

    getSchemaMetrics().flushMetrics();
  }

  protected abstract static class AbstractScannerV2
      extends AbstractHFileReader.Scanner {
    protected HFileBlock block;

    /**
     * Tracks the indexed key of the data block that follows the current one.
     * If nextIndexedKey is HConstants.NO_NEXT_INDEXED_KEY, the current data
     * block is the last data block.
     *
     * If nextIndexedKey is null, the next indexed key has not been loaded yet.
     */
    protected byte[] nextIndexedKey;

    public AbstractScannerV2(HFileReaderV2 r, boolean cacheBlocks,
        final boolean pread, final boolean isCompaction) {
      super(r, cacheBlocks, pread, isCompaction);
    }

    /**
     * An internal API function. Seek to the given key, optionally rewinding to
     * the first key of the block before doing the seek.
     *
     * @param key key byte array
     * @param offset key offset in the key byte array
     * @param length key length
     * @param rewind whether to rewind to the first key of the block before
     *        doing the seek. If this is false, we are assuming we never go
     *        back, otherwise the result is undefined.
     * @return -1 if the key is earlier than the first key of the file,
     *         0 if we are at the given key, and 1 if we are past the given key
     * @throws IOException
     */
    protected int seekTo(byte[] key, int offset, int length, boolean rewind)
        throws IOException {
      HFileBlockIndex.BlockIndexReader indexReader =
          reader.getDataBlockIndexReader();
      BlockWithScanInfo blockWithScanInfo =
        indexReader.loadDataBlockWithScanInfo(key, offset, length, block,
            cacheBlocks, pread, isCompaction);
      if (blockWithScanInfo == null || blockWithScanInfo.getHFileBlock() == null) {
        // This happens if the key e.g. falls before the beginning of the file.
        return -1;
      }
      return loadBlockAndSeekToKey(blockWithScanInfo.getHFileBlock(),
          blockWithScanInfo.getNextIndexedKey(), rewind, key, offset, length, false);
    }

    protected abstract ByteBuffer getFirstKeyInBlock(HFileBlock curBlock);

    protected abstract int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
        boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
        throws IOException;

    @Override
    public int seekTo(byte[] key, int offset, int length) throws IOException {
      // Always rewind to the first key of the block, because the given key
      // might be before or after the current key.
      return seekTo(key, offset, length, true);
    }

    @Override
    public int reseekTo(byte[] key, int offset, int length) throws IOException {
      int compared;
      if (isSeeked()) {
        ByteBuffer bb = getKey();
        compared = reader.getComparator().compare(key, offset,
            length, bb.array(), bb.arrayOffset(), bb.limit());
        if (compared < 1) {
          // If the required key is less than or equal to the current key,
          // don't do anything.
          return compared;
        } else {
          if (this.nextIndexedKey != null &&
              (this.nextIndexedKey == HConstants.NO_NEXT_INDEXED_KEY ||
               reader.getComparator().compare(key, offset, length,
                   nextIndexedKey, 0, nextIndexedKey.length) < 0)) {
            // The reader shall continue to scan the current data block instead
            // of querying the block index, as long as it knows the target key
            // is strictly smaller than the next indexed key or the current
            // data block is the last data block.
            return loadBlockAndSeekToKey(this.block, this.nextIndexedKey,
                false, key, offset, length, false);
          }
        }
      }
      // Don't rewind on a reseek operation, because reseek implies that we are
      // always going forward in the file.
      return seekTo(key, offset, length, false);
    }
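
    // Illustrative forward-iteration pattern (hypothetical keys k1 <= k2):
    // reseekTo never rewinds, so it is only valid when the scan moves
    // strictly forward through the file.
    //
    //   scanner.seekTo(k1, 0, k1.length);
    //   // ... read some key/values ...
    //   scanner.reseekTo(k2, 0, k2.length);  // k2 must not be before k1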

    @Override
    public boolean seekBefore(byte[] key, int offset, int length)
        throws IOException {
      HFileBlock seekToBlock =
          reader.getDataBlockIndexReader().seekToDataBlock(key, offset, length,
              block, cacheBlocks, pread, isCompaction);
      if (seekToBlock == null) {
        return false;
      }
      ByteBuffer firstKey = getFirstKeyInBlock(seekToBlock);

      if (reader.getComparator().compare(firstKey.array(),
          firstKey.arrayOffset(), firstKey.limit(), key, offset, length) == 0) {
        long previousBlockOffset = seekToBlock.getPrevBlockOffset();
        // The key we are interested in
        if (previousBlockOffset == -1) {
          // we have a 'problem', the key we want is the first of the file.
          return false;
        }

        // It is important that we compute and pass onDiskSize to the block
        // reader so that it does not have to read the header separately to
        // figure out the size.
        seekToBlock = reader.readBlock(previousBlockOffset,
            seekToBlock.getOffset() - previousBlockOffset, cacheBlocks,
            pread, isCompaction, BlockType.DATA);
        // TODO shortcut: seek forward in this block to the last key of the
        // block.
      }
      byte[] firstKeyInCurrentBlock = Bytes.getBytes(firstKey);
      loadBlockAndSeekToKey(seekToBlock, firstKeyInCurrentBlock, true, key, offset, length, true);
      return true;
    }

    /**
     * Scans blocks in the "scanned" section of the {@link HFile} until the
     * next data block is found.
     *
     * @return the next block, or null if there are no more data blocks
     * @throws IOException
     */
    protected HFileBlock readNextDataBlock() throws IOException {
      long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
      if (block == null) {
        return null;
      }

      HFileBlock curBlock = block;

      do {
        if (curBlock.getOffset() >= lastDataBlockOffset) {
          return null;
        }

        if (curBlock.getOffset() < 0) {
          throw new IOException("Invalid block file offset: " + curBlock);
        }

        // We are reading the next block without block type validation, because
        // it might turn out to be a non-data block.
        curBlock = reader.readBlock(curBlock.getOffset()
            + curBlock.getOnDiskSizeWithHeader(),
            curBlock.getNextBlockOnDiskSizeWithHeader(), cacheBlocks, pread,
            isCompaction, null);
      } while (!(curBlock.getBlockType().equals(BlockType.DATA) ||
          curBlock.getBlockType().equals(BlockType.ENCODED_DATA)));

      return curBlock;
    }
  }

  /**
   * Implementation of the {@link HFileScanner} interface.
   */
  protected static class ScannerV2 extends AbstractScannerV2 {
    private HFileReaderV2 reader;

    public ScannerV2(HFileReaderV2 r, boolean cacheBlocks,
        final boolean pread, final boolean isCompaction) {
      super(r, cacheBlocks, pread, isCompaction);
      this.reader = r;
    }

    @Override
    public KeyValue getKeyValue() {
      if (!isSeeked()) {
        return null;
      }

      KeyValue ret = new KeyValue(blockBuffer.array(),
          blockBuffer.arrayOffset() + blockBuffer.position(),
          KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen,
          currKeyLen);
      if (this.reader.shouldIncludeMemstoreTS()) {
        ret.setMemstoreTS(currMemstoreTS);
      }
      return ret;
    }

    @Override
    public ByteBuffer getKey() {
      assertSeeked();
      return ByteBuffer.wrap(
          blockBuffer.array(),
          blockBuffer.arrayOffset() + blockBuffer.position()
              + KEY_VALUE_LEN_SIZE, currKeyLen).slice();
    }

    @Override
    public ByteBuffer getValue() {
      assertSeeked();
      return ByteBuffer.wrap(
          blockBuffer.array(),
          blockBuffer.arrayOffset() + blockBuffer.position()
              + KEY_VALUE_LEN_SIZE + currKeyLen, currValueLen).slice();
    }

    private void setNonSeekedState() {
      block = null;
      blockBuffer = null;
      currKeyLen = 0;
      currValueLen = 0;
      currMemstoreTS = 0;
      currMemstoreTSLen = 0;
    }

    /**
     * Go to the next key/value in the block section. Loads the next block if
     * necessary. If successful, {@link #getKey()} and {@link #getValue()} can
     * be called.
     *
     * @return true if successfully navigated to the next key/value
     */
    @Override
    public boolean next() throws IOException {
      assertSeeked();

      try {
        blockBuffer.position(blockBuffer.position() + KEY_VALUE_LEN_SIZE
            + currKeyLen + currValueLen + currMemstoreTSLen);
      } catch (IllegalArgumentException e) {
        LOG.error("Current pos = " + blockBuffer.position()
            + "; currKeyLen = " + currKeyLen + "; currValLen = "
            + currValueLen + "; block limit = " + blockBuffer.limit()
            + "; HFile name = " + reader.getName()
            + "; currBlock currBlockOffset = " + block.getOffset());
        throw e;
      }

      if (blockBuffer.remaining() <= 0) {
        long lastDataBlockOffset =
            reader.getTrailer().getLastDataBlockOffset();

        if (block.getOffset() >= lastDataBlockOffset) {
          setNonSeekedState();
          return false;
        }

        // Read the next block.
        HFileBlock nextBlock = readNextDataBlock();
        if (nextBlock == null) {
          setNonSeekedState();
          return false;
        }

        updateCurrBlock(nextBlock);
        return true;
      }

      // We are still in the same block.
      readKeyValueLen();
      return true;
    }

    /**
     * Positions this scanner at the start of the file.
     *
     * @return false if empty file; i.e. a call to next would return false and
     *         the current key and value are undefined.
     * @throws IOException
     */
    @Override
    public boolean seekTo() throws IOException {
      if (reader == null) {
        return false;
      }

      if (reader.getTrailer().getEntryCount() == 0) {
        // No data blocks.
        return false;
      }

      long firstDataBlockOffset =
          reader.getTrailer().getFirstDataBlockOffset();
      if (block != null && block.getOffset() == firstDataBlockOffset) {
        blockBuffer.rewind();
        readKeyValueLen();
        return true;
      }

      block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
          isCompaction, BlockType.DATA);
      if (block.getOffset() < 0) {
        throw new IOException("Invalid block offset: " + block.getOffset());
      }
      updateCurrBlock(block);
      return true;
    }

    @Override
    protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
        boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
        throws IOException {
      if (block == null || block.getOffset() != seekToBlock.getOffset()) {
        updateCurrBlock(seekToBlock);
      } else if (rewind) {
        blockBuffer.rewind();
      }

      // Update the nextIndexedKey
      this.nextIndexedKey = nextIndexedKey;
      return blockSeek(key, offset, length, seekBefore);
    }

    /**
     * Updates the current block to be the given {@link HFileBlock}. Seeks to
     * the first key/value pair.
     *
     * @param newBlock the block to make current
     */
    private void updateCurrBlock(HFileBlock newBlock) {
      block = newBlock;

      // sanity check
      if (block.getBlockType() != BlockType.DATA) {
        throw new IllegalStateException("ScannerV2 works only on data " +
            "blocks, got " + block.getBlockType() + "; " +
            "fileName=" + reader.name + ", " +
            "dataBlockEncoder=" + reader.dataBlockEncoder + ", " +
            "isCompaction=" + isCompaction);
      }

      blockBuffer = block.getBufferWithoutHeader();
      readKeyValueLen();
      blockFetches++;

      // Reset the next indexed key
      this.nextIndexedKey = null;
    }

    private final void readKeyValueLen() {
      blockBuffer.mark();
      currKeyLen = blockBuffer.getInt();
      currValueLen = blockBuffer.getInt();
      blockBuffer.reset();
      if (this.reader.shouldIncludeMemstoreTS()) {
        try {
          int memstoreTSOffset = blockBuffer.arrayOffset()
              + blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen
              + currValueLen;
          currMemstoreTS = Bytes.readVLong(blockBuffer.array(),
              memstoreTSOffset);
          currMemstoreTSLen = WritableUtils.getVIntSize(currMemstoreTS);
        } catch (Exception e) {
          throw new RuntimeException("Error reading memstore timestamp", e);
        }
      }

      if (currKeyLen < 0 || currValueLen < 0
          || currKeyLen > blockBuffer.limit()
          || currValueLen > blockBuffer.limit()) {
        throw new IllegalStateException("Invalid currKeyLen " + currKeyLen
            + " or currValueLen " + currValueLen + ". Block offset: "
            + block.getOffset() + ", block length: " + blockBuffer.limit()
            + ", position: " + blockBuffer.position() + " (without header).");
      }
    }

    /**
     * Within a loaded block, seek to the last key that is smaller than or
     * equal to the key we are interested in.
     *
     * A note on seekBefore: if seekBefore is true AND the first key in the
     * block equals the given key, an exception is thrown. The caller has to
     * check for that case and load the previous block as appropriate.
     *
     * @param key the key to find
     * @param offset the offset of the key in its byte array
     * @param length the length of the key
     * @param seekBefore find the key before the given key in case of an exact
     *          match.
     * @return 0 in case of an exact key match, 1 in case of an inexact match
     */
    private int blockSeek(byte[] key, int offset, int length,
        boolean seekBefore) {
      int klen, vlen;
      long memstoreTS = 0;
      int memstoreTSLen = 0;
      int lastKeyValueSize = -1;
      do {
        blockBuffer.mark();
        klen = blockBuffer.getInt();
        vlen = blockBuffer.getInt();
        blockBuffer.reset();
        if (this.reader.shouldIncludeMemstoreTS()) {
          try {
            int memstoreTSOffset = blockBuffer.arrayOffset()
                + blockBuffer.position() + KEY_VALUE_LEN_SIZE + klen + vlen;
            memstoreTS = Bytes.readVLong(blockBuffer.array(),
                memstoreTSOffset);
            memstoreTSLen = WritableUtils.getVIntSize(memstoreTS);
          } catch (Exception e) {
            throw new RuntimeException("Error reading memstore timestamp", e);
          }
        }

        int keyOffset = blockBuffer.arrayOffset() + blockBuffer.position()
            + KEY_VALUE_LEN_SIZE;
        int comp = reader.getComparator().compare(key, offset, length,
            blockBuffer.array(), keyOffset, klen);

        if (comp == 0) {
          if (seekBefore) {
            if (lastKeyValueSize < 0) {
              throw new IllegalStateException("blockSeek with seekBefore "
                  + "at the first key of the block: key="
                  + Bytes.toStringBinary(key) + ", blockOffset="
                  + block.getOffset() + ", onDiskSize="
                  + block.getOnDiskSizeWithHeader());
            }
            blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
            readKeyValueLen();
            return 1; // non exact match.
          }
          currKeyLen = klen;
          currValueLen = vlen;
          if (this.reader.shouldIncludeMemstoreTS()) {
            currMemstoreTS = memstoreTS;
            currMemstoreTSLen = memstoreTSLen;
          }
          return 0; // indicate exact match
        }

        if (comp < 0) {
          if (lastKeyValueSize > 0) {
            blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
          }
          readKeyValueLen();
          return 1;
        }

        // The size of this key/value tuple, including key/value length fields.
        lastKeyValueSize = klen + vlen + memstoreTSLen + KEY_VALUE_LEN_SIZE;
        blockBuffer.position(blockBuffer.position() + lastKeyValueSize);
      } while (blockBuffer.remaining() > 0);

      // Seek to the last key we successfully read. This will happen if this is
      // the last key/value pair in the file, in which case the following call
      // to next() has to return false.
      blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
      readKeyValueLen();
      return 1; // didn't exactly find it.
    }
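
    // Worked example (hypothetical block containing keys {b, d}, with
    // seekBefore = false): seeking "d" returns 0 and positions the scanner at
    // "d"; seeking "c" returns 1 and positions at "b", the last key smaller
    // than "c". With seekBefore = true, seeking "d" positions at "b" and
    // returns 1, but doing the same at the first key of the block ("b")
    // throws an IllegalStateException, which the caller must anticipate.
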
    @Override
    protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
      ByteBuffer buffer = curBlock.getBufferWithoutHeader();
      // It is safe to manipulate this buffer because we own the buffer object.
      buffer.rewind();
      int klen = buffer.getInt();
      buffer.getInt(); // Skip past the value length.
      ByteBuffer keyBuff = buffer.slice();
      keyBuff.limit(klen);
      keyBuff.rewind();
      return keyBuff;
    }

    @Override
    public String getKeyString() {
      return Bytes.toStringBinary(blockBuffer.array(),
          blockBuffer.arrayOffset() + blockBuffer.position()
              + KEY_VALUE_LEN_SIZE, currKeyLen);
    }

    @Override
    public String getValueString() {
      return Bytes.toString(blockBuffer.array(), blockBuffer.arrayOffset()
          + blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen,
          currValueLen);
    }
  }

  /**
   * ScannerV2 that operates on encoded data blocks.
   */
  protected static class EncodedScannerV2 extends AbstractScannerV2 {
    private DataBlockEncoder.EncodedSeeker seeker = null;
    private DataBlockEncoder dataBlockEncoder = null;
    private final boolean includesMemstoreTS;

    public EncodedScannerV2(HFileReaderV2 reader, boolean cacheBlocks,
        boolean pread, boolean isCompaction, boolean includesMemstoreTS) {
      super(reader, cacheBlocks, pread, isCompaction);
      this.includesMemstoreTS = includesMemstoreTS;
    }

    private void setDataBlockEncoder(DataBlockEncoder dataBlockEncoder) {
      this.dataBlockEncoder = dataBlockEncoder;
      seeker = dataBlockEncoder.createSeeker(reader.getComparator(),
          includesMemstoreTS);
    }

    /**
     * Updates the current block to be the given {@link HFileBlock}. Seeks to
     * the first key/value pair.
     *
     * @param newBlock the block to make current
     */
    private void updateCurrentBlock(HFileBlock newBlock) {
      block = newBlock;

      // sanity checks
      if (block.getBlockType() != BlockType.ENCODED_DATA) {
        throw new IllegalStateException(
            "EncodedScannerV2 works only on encoded data blocks");
      }

      short dataBlockEncoderId = block.getDataBlockEncodingId();
      if (dataBlockEncoder == null ||
          !DataBlockEncoding.isCorrectEncoder(dataBlockEncoder,
              dataBlockEncoderId)) {
        DataBlockEncoder encoder =
            DataBlockEncoding.getDataBlockEncoderById(dataBlockEncoderId);
        setDataBlockEncoder(encoder);
      }

      seeker.setCurrentBuffer(getEncodedBuffer(newBlock));
      blockFetches++;
    }

    private ByteBuffer getEncodedBuffer(HFileBlock newBlock) {
      ByteBuffer origBlock = newBlock.getBufferReadOnly();
      ByteBuffer encodedBlock = ByteBuffer.wrap(origBlock.array(),
          origBlock.arrayOffset() + newBlock.headerSize() +
          DataBlockEncoding.ID_SIZE,
          newBlock.getUncompressedSizeWithoutHeader() -
          DataBlockEncoding.ID_SIZE).slice();
      return encodedBlock;
    }
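
    // Layout of an ENCODED_DATA block as assumed by getEncodedBuffer() above:
    //
    //   [block header][encoding id: DataBlockEncoding.ID_SIZE bytes]
    //   [encoded key/values ...]
    //
    // The returned slice skips the header and the encoding id, so the seeker
    // only ever sees the encoded payload.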

    @Override
    public boolean seekTo() throws IOException {
      if (reader == null) {
        return false;
      }

      if (reader.getTrailer().getEntryCount() == 0) {
        // No data blocks.
        return false;
      }

      long firstDataBlockOffset =
          reader.getTrailer().getFirstDataBlockOffset();
      if (block != null && block.getOffset() == firstDataBlockOffset) {
        seeker.rewind();
        return true;
      }

      block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
          isCompaction, BlockType.DATA);
      if (block.getOffset() < 0) {
        throw new IOException("Invalid block offset: " + block.getOffset());
      }
      updateCurrentBlock(block);
      return true;
    }

    @Override
    public boolean next() throws IOException {
      boolean isValid = seeker.next();
      if (!isValid) {
        block = readNextDataBlock();
        isValid = block != null;
        if (isValid) {
          updateCurrentBlock(block);
        }
      }
      return isValid;
    }

    @Override
    public ByteBuffer getKey() {
      assertValidSeek();
      return seeker.getKeyDeepCopy();
    }

    @Override
    public ByteBuffer getValue() {
      assertValidSeek();
      return seeker.getValueShallowCopy();
    }

    @Override
    public KeyValue getKeyValue() {
      if (block == null) {
        return null;
      }
      return seeker.getKeyValue();
    }

    @Override
    public String getKeyString() {
      ByteBuffer keyBuffer = getKey();
      return Bytes.toStringBinary(keyBuffer.array(),
          keyBuffer.arrayOffset(), keyBuffer.limit());
    }

    @Override
    public String getValueString() {
      ByteBuffer valueBuffer = getValue();
      return Bytes.toStringBinary(valueBuffer.array(),
          valueBuffer.arrayOffset(), valueBuffer.limit());
    }

    private void assertValidSeek() {
      if (block == null) {
        throw new NotSeekedException();
      }
    }

    @Override
    protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
      return dataBlockEncoder.getFirstKeyInBlock(getEncodedBuffer(curBlock));
    }

    @Override
    protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
        boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
        throws IOException {
      if (block == null || block.getOffset() != seekToBlock.getOffset()) {
        updateCurrentBlock(seekToBlock);
      } else if (rewind) {
        seeker.rewind();
      }
      this.nextIndexedKey = nextIndexedKey;
      return seeker.seekToKeyInBlock(key, offset, length, seekBefore);
    }
  }

  /**
   * Returns a buffer with the Bloom filter metadata. The caller takes
   * ownership of the buffer.
   */
  @Override
  public DataInput getGeneralBloomFilterMetadata() throws IOException {
    return this.getBloomFilterMetadata(BlockType.GENERAL_BLOOM_META);
  }

  @Override
  public DataInput getDeleteBloomFilterMetadata() throws IOException {
    return this.getBloomFilterMetadata(BlockType.DELETE_FAMILY_BLOOM_META);
  }

  private DataInput getBloomFilterMetadata(BlockType blockType)
      throws IOException {
    if (blockType != BlockType.GENERAL_BLOOM_META &&
        blockType != BlockType.DELETE_FAMILY_BLOOM_META) {
      throw new RuntimeException("Block Type: " + blockType.toString() +
          " is not supported");
    }

    for (HFileBlock b : loadOnOpenBlocks) {
      if (b.getBlockType() == blockType) {
        return b.getByteStream();
      }
    }
    return null;
  }

  @Override
  public boolean isFileInfoLoaded() {
    return true; // We load file info in the constructor in version 2.
  }

  /**
   * Validates that the minor version is within acceptable limits.
   * Otherwise throws a RuntimeException.
   */
  private void validateMinorVersion(Path path, int minorVersion) {
    if (minorVersion < MIN_MINOR_VERSION ||
        minorVersion > MAX_MINOR_VERSION) {
      String msg = "Minor version for path " + path +
                   " is expected to be between " +
                   MIN_MINOR_VERSION + " and " + MAX_MINOR_VERSION +
                   " but is found to be " + minorVersion;
      LOG.error(msg);
      throw new RuntimeException(msg);
    }
  }
}