View Javadoc

1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.io.hfile;
20  
21  import java.io.DataInput;
22  import java.io.IOException;
23  import java.nio.ByteBuffer;
24  import java.util.ArrayList;
25  import java.util.List;
26  
27  import org.apache.commons.logging.Log;
28  import org.apache.commons.logging.LogFactory;
29  import org.apache.hadoop.classification.InterfaceAudience;
30  import org.apache.hadoop.fs.Path;
31  import org.apache.hadoop.hbase.HConstants;
32  import org.apache.hadoop.hbase.KeyValue;
33  import org.apache.hadoop.hbase.KeyValue.KVComparator;
34  import org.apache.hadoop.hbase.fs.HFileSystem;
35  import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper;
36  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoder;
37  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
38  import org.apache.hadoop.hbase.io.hfile.HFile.FileInfo;
39  import org.apache.hadoop.hbase.util.Bytes;
40  import org.apache.hadoop.hbase.util.IdLock;
41  import org.apache.hadoop.io.WritableUtils;
42  import org.cloudera.htrace.Trace;
43  import org.cloudera.htrace.TraceScope;
44  
45  /**
46   * {@link HFile} reader for version 2.
47   */
48  @InterfaceAudience.Private
49  public class HFileReaderV2 extends AbstractHFileReader {
50  
51    private static final Log LOG = LogFactory.getLog(HFileReaderV2.class);
52  
53    /**
54     * The size of a (key length, value length) tuple that prefixes each entry in
55     * a data block.
56     */
57    private static int KEY_VALUE_LEN_SIZE = 2 * Bytes.SIZEOF_INT;
58  
59    private boolean includesMemstoreTS = false;
60    private boolean decodeMemstoreTS = false;
61  
62    private boolean shouldIncludeMemstoreTS() {
63      return includesMemstoreTS;
64    }
65  
66    /** Filesystem-level block reader. */
67    private HFileBlock.FSReader fsBlockReader;
68  
69    /**
70     * A "sparse lock" implementation allowing to lock on a particular block
71     * identified by offset. The purpose of this is to avoid two clients loading
72     * the same block, and have all but one client wait to get the block from the
73     * cache.
74     */
75    private IdLock offsetLock = new IdLock();
76  
77    /**
78     * Blocks read from the load-on-open section, excluding data root index, meta
79     * index, and file info.
80     */
81    private List<HFileBlock> loadOnOpenBlocks = new ArrayList<HFileBlock>();
82  
83    /** Minimum minor version supported by this HFile format */
84    static final int MIN_MINOR_VERSION = 0;
85  
86    /** Maximum minor version supported by this HFile format */
87    // We went to version 2 when we moved to pb'ing fileinfo and the trailer on
88    // the file. This version can read Writables version 1.
89    static final int MAX_MINOR_VERSION = 3;
90  
91    /** Minor versions starting with this number have faked index key */
92    static final int MINOR_VERSION_WITH_FAKED_KEY = 3;
93  
94    /**
95     * Opens a HFile. You must load the index before you can use it by calling
96     * {@link #loadFileInfo()}.
97     *
98     * @param path Path to HFile.
99     * @param trailer File trailer.
100    * @param fsdis input stream.
101    * @param size Length of the stream.
102    * @param cacheConf Cache configuration.
103    * @param preferredEncodingInCache the encoding to use in cache in case we
104    *          have a choice. If the file is already encoded on disk, we will
105    *          still use its on-disk encoding in cache.
106    */
107   public HFileReaderV2(Path path, FixedFileTrailer trailer,
108       final FSDataInputStreamWrapper fsdis, final long size, final CacheConfig cacheConf,
109       final HFileSystem hfs) throws IOException {
110     super(path, trailer, size, cacheConf, hfs);
111     trailer.expectMajorVersion(2);
112     validateMinorVersion(path, trailer.getMinorVersion());
113     HFileBlock.FSReaderV2 fsBlockReaderV2 = new HFileBlock.FSReaderV2(fsdis,
114         compressAlgo, fileSize, trailer.getMinorVersion(), hfs, path);
115     this.fsBlockReader = fsBlockReaderV2; // upcast
116 
117     // Comparator class name is stored in the trailer in version 2.
118     comparator = trailer.createComparator();
119     dataBlockIndexReader = new HFileBlockIndex.BlockIndexReader(comparator,
120         trailer.getNumDataIndexLevels(), this);
121     metaBlockIndexReader = new HFileBlockIndex.BlockIndexReader(
122         KeyValue.RAW_COMPARATOR, 1);
123 
124     // Parse load-on-open data.
125 
126     HFileBlock.BlockIterator blockIter = fsBlockReaderV2.blockRange(
127         trailer.getLoadOnOpenDataOffset(),
128         fileSize - trailer.getTrailerSize());
129 
130     // Data index. We also read statistics about the block index written after
131     // the root level.
132     dataBlockIndexReader.readMultiLevelIndexRoot(
133         blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
134         trailer.getDataIndexCount());
135 
136     // Meta index.
137     metaBlockIndexReader.readRootIndex(
138         blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
139         trailer.getMetaIndexCount());
140 
141     // File info
142     fileInfo = new FileInfo();
143     fileInfo.read(blockIter.nextBlockWithBlockType(BlockType.FILE_INFO).getByteStream());
144     lastKey = fileInfo.get(FileInfo.LASTKEY);
145     avgKeyLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_KEY_LEN));
146     avgValueLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_VALUE_LEN));
147     byte [] keyValueFormatVersion =
148         fileInfo.get(HFileWriterV2.KEY_VALUE_VERSION);
149     includesMemstoreTS = keyValueFormatVersion != null &&
150         Bytes.toInt(keyValueFormatVersion) ==
151             HFileWriterV2.KEY_VALUE_VER_WITH_MEMSTORE;
152     fsBlockReaderV2.setIncludesMemstoreTS(includesMemstoreTS);
153     if (includesMemstoreTS) {
154       decodeMemstoreTS = Bytes.toLong(fileInfo.get(HFileWriterV2.MAX_MEMSTORE_TS_KEY)) > 0;
155     }
156 
157     // Read data block encoding algorithm name from file info.
158     dataBlockEncoder = HFileDataBlockEncoderImpl.createFromFileInfo(fileInfo);
159     fsBlockReaderV2.setDataBlockEncoder(dataBlockEncoder);
160 
161     // Store all other load-on-open blocks for further consumption.
162     HFileBlock b;
163     while ((b = blockIter.nextBlock()) != null) {
164       loadOnOpenBlocks.add(b);
165     }
166   }
167 
168   /**
169    * Create a Scanner on this file. No seeks or reads are done on creation. Call
170    * {@link HFileScanner#seekTo(byte[])} to position an start the read. There is
171    * nothing to clean up in a Scanner. Letting go of your references to the
172    * scanner is sufficient.
173    *
174    * @param cacheBlocks True if we should cache blocks read in by this scanner.
175    * @param pread Use positional read rather than seek+read if true (pread is
176    *          better for random reads, seek+read is better scanning).
177    * @param isCompaction is scanner being used for a compaction?
178    * @return Scanner on this file.
179    */
180    @Override
181    public HFileScanner getScanner(boolean cacheBlocks, final boolean pread,
182       final boolean isCompaction) {
183     // check if we want to use data block encoding in memory
184     if (dataBlockEncoder.useEncodedScanner()) {
185       return new EncodedScannerV2(this, cacheBlocks, pread, isCompaction,
186           includesMemstoreTS);
187     }
188 
189     return new ScannerV2(this, cacheBlocks, pread, isCompaction);
190   }
191 
192   /**
193    * @param metaBlockName
194    * @param cacheBlock Add block to cache, if found
195    * @return block wrapped in a ByteBuffer, with header skipped
196    * @throws IOException
197    */
198   @Override
199   public ByteBuffer getMetaBlock(String metaBlockName, boolean cacheBlock)
200       throws IOException {
201     if (trailer.getMetaIndexCount() == 0) {
202       return null; // there are no meta blocks
203     }
204     if (metaBlockIndexReader == null) {
205       throw new IOException("Meta index not loaded");
206     }
207 
208     byte[] mbname = Bytes.toBytes(metaBlockName);
209     int block = metaBlockIndexReader.rootBlockContainingKey(mbname, 0,
210         mbname.length);
211     if (block == -1)
212       return null;
213     long blockSize = metaBlockIndexReader.getRootBlockDataSize(block);
214     long startTimeNs = System.nanoTime();
215 
216     // Per meta key from any given file, synchronize reads for said block. This
217     // is OK to do for meta blocks because the meta block index is always
218     // single-level.
219     synchronized (metaBlockIndexReader.getRootBlockKey(block)) {
220       // Check cache for block. If found return.
221       long metaBlockOffset = metaBlockIndexReader.getRootBlockOffset(block);
222       BlockCacheKey cacheKey = new BlockCacheKey(name, metaBlockOffset,
223           DataBlockEncoding.NONE, BlockType.META);
224 
225       cacheBlock &= cacheConf.shouldCacheDataOnRead();
226       if (cacheConf.isBlockCacheEnabled()) {
227         HFileBlock cachedBlock =
228           (HFileBlock) cacheConf.getBlockCache().getBlock(cacheKey, cacheBlock, false);
229         if (cachedBlock != null) {
230           // Return a distinct 'shallow copy' of the block,
231           // so pos does not get messed by the scanner
232           return cachedBlock.getBufferWithoutHeader();
233         }
234         // Cache Miss, please load.
235       }
236 
237       HFileBlock metaBlock = fsBlockReader.readBlockData(metaBlockOffset,
238           blockSize, -1, true);
239 
240       final long delta = System.nanoTime() - startTimeNs;
241       HFile.offerReadLatency(delta, true);
242 
243       // Cache the block
244       if (cacheBlock) {
245         cacheConf.getBlockCache().cacheBlock(cacheKey, metaBlock,
246             cacheConf.isInMemory());
247       }
248 
249       return metaBlock.getBufferWithoutHeader();
250     }
251   }
252 
253   /**
254    * Read in a file block.
255    * @param dataBlockOffset offset to read.
256    * @param onDiskBlockSize size of the block
257    * @param cacheBlock
258    * @param pread Use positional read instead of seek+read (positional is
259    *          better doing random reads whereas seek+read is better scanning).
260    * @param isCompaction is this block being read as part of a compaction
261    * @param expectedBlockType the block type we are expecting to read with this
262    *          read operation, or null to read whatever block type is available
263    *          and avoid checking (that might reduce caching efficiency of
264    *          encoded data blocks)
265    * @return Block wrapped in a ByteBuffer.
266    * @throws IOException
267    */
268   @Override
269   public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize,
270       final boolean cacheBlock, boolean pread, final boolean isCompaction,
271       BlockType expectedBlockType)
272       throws IOException {
273     if (dataBlockIndexReader == null) {
274       throw new IOException("Block index not loaded");
275     }
276     if (dataBlockOffset < 0
277         || dataBlockOffset >= trailer.getLoadOnOpenDataOffset()) {
278       throw new IOException("Requested block is out of range: "
279           + dataBlockOffset + ", lastDataBlockOffset: "
280           + trailer.getLastDataBlockOffset());
281     }
282     // For any given block from any given file, synchronize reads for said
283     // block.
284     // Without a cache, this synchronizing is needless overhead, but really
285     // the other choice is to duplicate work (which the cache would prevent you
286     // from doing).
287 
288     BlockCacheKey cacheKey =
289         new BlockCacheKey(name, dataBlockOffset,
290             dataBlockEncoder.getDataBlockEncoding(),
291             expectedBlockType);
292 
293     boolean useLock = false;
294     IdLock.Entry lockEntry = null;
295     TraceScope traceScope = Trace.startSpan("HFileReaderV2.readBlock");
296     try {
297       while (true) {
298         if (useLock) {
299           lockEntry = offsetLock.getLockEntry(dataBlockOffset);
300         }
301 
302         // Check cache for block. If found return.
303         if (cacheConf.isBlockCacheEnabled()) {
304           // Try and get the block from the block cache. If the useLock variable is true then this
305           // is the second time through the loop and it should not be counted as a block cache miss.
306           HFileBlock cachedBlock = (HFileBlock) cacheConf.getBlockCache().getBlock(cacheKey,
307               cacheBlock, useLock);
308           if (cachedBlock != null) {
309             validateBlockType(cachedBlock, expectedBlockType);
310             if (cachedBlock.getBlockType().isData()) {
311               HFile.dataBlockReadCnt.incrementAndGet();
312 
313               // Validate encoding type for data blocks. We include encoding
314               // type in the cache key, and we expect it to match on a cache hit.
315               if (cachedBlock.getDataBlockEncoding() != dataBlockEncoder.getDataBlockEncoding()) {
316                 throw new IOException("Cached block under key " + cacheKey + " "
317                   + "has wrong encoding: " + cachedBlock.getDataBlockEncoding() + " (expected: "
318                   + dataBlockEncoder.getDataBlockEncoding() + ")");
319               }
320             }
321             return cachedBlock;
322           }
323           // Carry on, please load.
324         }
325         if (!useLock) {
326           // check cache again with lock
327           useLock = true;
328           continue;
329         }
330         if (Trace.isTracing()) {
331           traceScope.getSpan().addTimelineAnnotation("blockCacheMiss");
332         }
333         // Load block from filesystem.
334         long startTimeNs = System.nanoTime();
335         HFileBlock hfileBlock = fsBlockReader.readBlockData(dataBlockOffset, onDiskBlockSize, -1,
336             pread);
337         validateBlockType(hfileBlock, expectedBlockType);
338 
339         final long delta = System.nanoTime() - startTimeNs;
340         HFile.offerReadLatency(delta, pread);
341 
342         // Cache the block if necessary
343         if (cacheBlock && cacheConf.shouldCacheBlockOnRead(hfileBlock.getBlockType().getCategory())) {
344           cacheConf.getBlockCache().cacheBlock(cacheKey, hfileBlock, cacheConf.isInMemory());
345         }
346 
347         if (hfileBlock.getBlockType().isData()) {
348           HFile.dataBlockReadCnt.incrementAndGet();
349         }
350 
351         return hfileBlock;
352       }
353     } finally {
354       traceScope.close();
355       if (lockEntry != null) {
356         offsetLock.releaseLockEntry(lockEntry);
357       }
358     }
359   }
360 
361   @Override
362   public boolean hasMVCCInfo() {
363     return includesMemstoreTS && decodeMemstoreTS;
364   }
365 
366   /**
367    * Compares the actual type of a block retrieved from cache or disk with its
368    * expected type and throws an exception in case of a mismatch. Expected
369    * block type of {@link BlockType#DATA} is considered to match the actual
370    * block type [@link {@link BlockType#ENCODED_DATA} as well.
371    * @param block a block retrieved from cache or disk
372    * @param expectedBlockType the expected block type, or null to skip the
373    *          check
374    */
375   private void validateBlockType(HFileBlock block,
376       BlockType expectedBlockType) throws IOException {
377     if (expectedBlockType == null) {
378       return;
379     }
380     BlockType actualBlockType = block.getBlockType();
381     if (actualBlockType == BlockType.ENCODED_DATA &&
382         expectedBlockType == BlockType.DATA) {
383       // We consider DATA to match ENCODED_DATA for the purpose of this
384       // verification.
385       return;
386     }
387     if (actualBlockType != expectedBlockType) {
388       throw new IOException("Expected block type " + expectedBlockType + ", " +
389           "but got " + actualBlockType + ": " + block);
390     }
391   }
392 
393   /**
394    * @return Last key in the file. May be null if file has no entries. Note that
395    *         this is not the last row key, but rather the byte form of the last
396    *         KeyValue.
397    */
398   @Override
399   public byte[] getLastKey() {
400     return dataBlockIndexReader.isEmpty() ? null : lastKey;
401   }
402 
403   /**
404    * @return Midkey for this file. We work with block boundaries only so
405    *         returned midkey is an approximation only.
406    * @throws IOException
407    */
408   @Override
409   public byte[] midkey() throws IOException {
410     return dataBlockIndexReader.midkey();
411   }
412 
413   @Override
414   public void close() throws IOException {
415     close(cacheConf.shouldEvictOnClose());
416   }
417 
418   public void close(boolean evictOnClose) throws IOException {
419     if (evictOnClose && cacheConf.isBlockCacheEnabled()) {
420       int numEvicted = cacheConf.getBlockCache().evictBlocksByHfileName(name);
421       if (LOG.isTraceEnabled()) {
422         LOG.trace("On close, file=" + name + " evicted=" + numEvicted
423           + " block(s)");
424       }
425     }
426     fsBlockReader.closeStreams();
427   }
428 
429   /** For testing */
430   @Override
431   HFileBlock.FSReader getUncachedBlockReader() {
432     return fsBlockReader;
433   }
434 
435 
436   protected abstract static class AbstractScannerV2
437       extends AbstractHFileReader.Scanner {
438     protected HFileBlock block;
439 
440     /**
441      * The next indexed key is to keep track of the indexed key of the next data block.
442      * If the nextIndexedKey is HConstants.NO_NEXT_INDEXED_KEY, it means that the
443      * current data block is the last data block.
444      *
445      * If the nextIndexedKey is null, it means the nextIndexedKey has not been loaded yet.
446      */
447     protected byte[] nextIndexedKey;
448 
449     public AbstractScannerV2(HFileReaderV2 r, boolean cacheBlocks,
450         final boolean pread, final boolean isCompaction) {
451       super(r, cacheBlocks, pread, isCompaction);
452     }
453 
454     /**
455      * An internal API function. Seek to the given key, optionally rewinding to
456      * the first key of the block before doing the seek.
457      *
458      * @param key key byte array
459      * @param offset key offset in the key byte array
460      * @param length key length
461      * @param rewind whether to rewind to the first key of the block before
462      *        doing the seek. If this is false, we are assuming we never go
463      *        back, otherwise the result is undefined.
464      * @return -1 if the key is earlier than the first key of the file,
465      *         0 if we are at the given key, 1 if we are past the given key
466      *         -2 if the key is earlier than the first key of the file while
467      *         using a faked index key
468      * @throws IOException
469      */
470     protected int seekTo(byte[] key, int offset, int length, boolean rewind)
471         throws IOException {
472       HFileBlockIndex.BlockIndexReader indexReader =
473           reader.getDataBlockIndexReader();
474       BlockWithScanInfo blockWithScanInfo =
475         indexReader.loadDataBlockWithScanInfo(key, offset, length, block,
476             cacheBlocks, pread, isCompaction);
477       if (blockWithScanInfo == null || blockWithScanInfo.getHFileBlock() == null) {
478         // This happens if the key e.g. falls before the beginning of the file.
479         return -1;
480       }
481       return loadBlockAndSeekToKey(blockWithScanInfo.getHFileBlock(),
482           blockWithScanInfo.getNextIndexedKey(), rewind, key, offset, length, false);
483     }
484 
485     protected abstract ByteBuffer getFirstKeyInBlock(HFileBlock curBlock);
486 
487     protected abstract int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
488         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
489         throws IOException;
490 
491     @Override
492     public int seekTo(byte[] key, int offset, int length) throws IOException {
493       // Always rewind to the first key of the block, because the given key
494       // might be before or after the current key.
495       return seekTo(key, offset, length, true);
496     }
497 
498     @Override
499     public int reseekTo(byte[] key, int offset, int length) throws IOException {
500       int compared;
501       if (isSeeked()) {
502         compared = compareKey(reader.getComparator(), key, offset, length);
503         if (compared < 1) {
504           // If the required key is less than or equal to current key, then
505           // don't do anything.
506           return compared;
507         } else {
508           if (this.nextIndexedKey != null &&
509               (this.nextIndexedKey == HConstants.NO_NEXT_INDEXED_KEY ||
510                reader.getComparator().compareFlatKey(key, offset, length,
511                    nextIndexedKey, 0, nextIndexedKey.length) < 0)) {
512             // The reader shall continue to scan the current data block instead of querying the
513             // block index as long as it knows the target key is strictly smaller than
514             // the next indexed key or the current data block is the last data block.
515             return loadBlockAndSeekToKey(this.block, this.nextIndexedKey,
516                 false, key, offset, length, false);
517           }
518         }
519       }
520       // Don't rewind on a reseek operation, because reseek implies that we are
521       // always going forward in the file.
522       return seekTo(key, offset, length, false);
523     }
524 
525     @Override
526     public boolean seekBefore(byte[] key, int offset, int length)
527         throws IOException {
528       HFileBlock seekToBlock =
529           reader.getDataBlockIndexReader().seekToDataBlock(key, offset, length,
530               block, cacheBlocks, pread, isCompaction);
531       if (seekToBlock == null) {
532         return false;
533       }
534       ByteBuffer firstKey = getFirstKeyInBlock(seekToBlock);
535 
536       if (reader.getComparator().compareFlatKey(firstKey.array(),
537           firstKey.arrayOffset(), firstKey.limit(), key, offset, length) == 0)
538       {
539         long previousBlockOffset = seekToBlock.getPrevBlockOffset();
540         // The key we are interested in
541         if (previousBlockOffset == -1) {
542           // we have a 'problem', the key we want is the first of the file.
543           return false;
544         }
545 
546         // It is important that we compute and pass onDiskSize to the block
547         // reader so that it does not have to read the header separately to
548         // figure out the size.
549         seekToBlock = reader.readBlock(previousBlockOffset,
550             seekToBlock.getOffset() - previousBlockOffset, cacheBlocks,
551             pread, isCompaction, BlockType.DATA);
552         // TODO shortcut: seek forward in this block to the last key of the
553         // block.
554       }
555       byte[] firstKeyInCurrentBlock = Bytes.getBytes(firstKey);
556       loadBlockAndSeekToKey(seekToBlock, firstKeyInCurrentBlock, true, key, offset, length, true);
557       return true;
558     }
559 
560 
561     /**
562      * Scans blocks in the "scanned" section of the {@link HFile} until the next
563      * data block is found.
564      *
565      * @return the next block, or null if there are no more data blocks
566      * @throws IOException
567      */
568     protected HFileBlock readNextDataBlock() throws IOException {
569       long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
570       if (block == null)
571         return null;
572 
573       HFileBlock curBlock = block;
574 
575       do {
576         if (curBlock.getOffset() >= lastDataBlockOffset)
577           return null;
578 
579         if (curBlock.getOffset() < 0) {
580           throw new IOException("Invalid block file offset: " + block);
581         }
582 
583         // We are reading the next block without block type validation, because
584         // it might turn out to be a non-data block.
585         curBlock = reader.readBlock(curBlock.getOffset()
586             + curBlock.getOnDiskSizeWithHeader(),
587             curBlock.getNextBlockOnDiskSizeWithHeader(), cacheBlocks, pread,
588             isCompaction, null);
589       } while (!curBlock.getBlockType().isData());
590 
591       return curBlock;
592     }
593 
594     @Override
595     public boolean isSeeked(){
596       return this.block != null;
597     }
598 
599     /**
600      * Compare the given key against the current key
601      * @param comparator
602      * @param key
603      * @param offset
604      * @param length
605      * @return -1 is the passed key is smaller than the current key, 0 if equal and 1 if greater
606      */
607     public abstract int compareKey(KVComparator comparator, byte[] key, int offset,
608         int length);
609   }
610 
611   /**
612    * Implementation of {@link HFileScanner} interface.
613    */
614   protected static class ScannerV2 extends AbstractScannerV2 {
615     private HFileReaderV2 reader;
616 
617     public ScannerV2(HFileReaderV2 r, boolean cacheBlocks,
618         final boolean pread, final boolean isCompaction) {
619       super(r, cacheBlocks, pread, isCompaction);
620       this.reader = r;
621     }
622 
623     @Override
624     public KeyValue getKeyValue() {
625       if (!isSeeked())
626         return null;
627 
628       KeyValue ret = new KeyValue(blockBuffer.array(),
629           blockBuffer.arrayOffset() + blockBuffer.position(),
630           KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen);
631       if (this.reader.shouldIncludeMemstoreTS()) {
632         ret.setMvccVersion(currMemstoreTS);
633       }
634       return ret;
635     }
636 
637     @Override
638     public ByteBuffer getKey() {
639       assertSeeked();
640       return ByteBuffer.wrap(
641           blockBuffer.array(),
642           blockBuffer.arrayOffset() + blockBuffer.position()
643               + KEY_VALUE_LEN_SIZE, currKeyLen).slice();
644     }
645 
646     @Override
647     public int compareKey(KVComparator comparator, byte[] key, int offset, int length) {
648       return comparator.compareFlatKey(key, offset, length, blockBuffer.array(),
649           blockBuffer.arrayOffset() + blockBuffer.position() + KEY_VALUE_LEN_SIZE, currKeyLen);
650     }
651 
652     @Override
653     public ByteBuffer getValue() {
654       assertSeeked();
655       return ByteBuffer.wrap(
656           blockBuffer.array(),
657           blockBuffer.arrayOffset() + blockBuffer.position()
658               + KEY_VALUE_LEN_SIZE + currKeyLen, currValueLen).slice();
659     }
660 
661     private void setNonSeekedState() {
662       block = null;
663       blockBuffer = null;
664       currKeyLen = 0;
665       currValueLen = 0;
666       currMemstoreTS = 0;
667       currMemstoreTSLen = 0;
668     }
669 
670     /**
671      * Go to the next key/value in the block section. Loads the next block if
672      * necessary. If successful, {@link #getKey()} and {@link #getValue()} can
673      * be called.
674      *
675      * @return true if successfully navigated to the next key/value
676      */
677     @Override
678     public boolean next() throws IOException {
679       assertSeeked();
680 
681       try {
682         blockBuffer.position(blockBuffer.position() + KEY_VALUE_LEN_SIZE
683             + currKeyLen + currValueLen + currMemstoreTSLen);
684       } catch (IllegalArgumentException e) {
685         LOG.error("Current pos = " + blockBuffer.position()
686             + "; currKeyLen = " + currKeyLen + "; currValLen = "
687             + currValueLen + "; block limit = " + blockBuffer.limit()
688             + "; HFile name = " + reader.getName()
689             + "; currBlock currBlockOffset = " + block.getOffset());
690         throw e;
691       }
692 
693       if (blockBuffer.remaining() <= 0) {
694         long lastDataBlockOffset =
695             reader.getTrailer().getLastDataBlockOffset();
696 
697         if (block.getOffset() >= lastDataBlockOffset) {
698           setNonSeekedState();
699           return false;
700         }
701 
702         // read the next block
703         HFileBlock nextBlock = readNextDataBlock();
704         if (nextBlock == null) {
705           setNonSeekedState();
706           return false;
707         }
708 
709         updateCurrBlock(nextBlock);
710         return true;
711       }
712 
713       // We are still in the same block.
714       readKeyValueLen();
715       return true;
716     }
717 
718     /**
719      * Positions this scanner at the start of the file.
720      *
721      * @return false if empty file; i.e. a call to next would return false and
722      *         the current key and value are undefined.
723      * @throws IOException
724      */
725     @Override
726     public boolean seekTo() throws IOException {
727       if (reader == null) {
728         return false;
729       }
730 
731       if (reader.getTrailer().getEntryCount() == 0) {
732         // No data blocks.
733         return false;
734       }
735 
736       long firstDataBlockOffset =
737           reader.getTrailer().getFirstDataBlockOffset();
738       if (block != null && block.getOffset() == firstDataBlockOffset) {
739         blockBuffer.rewind();
740         readKeyValueLen();
741         return true;
742       }
743 
744       block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
745           isCompaction, BlockType.DATA);
746       if (block.getOffset() < 0) {
747         throw new IOException("Invalid block offset: " + block.getOffset());
748       }
749       updateCurrBlock(block);
750       return true;
751     }
752 
753     @Override
754     protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
755         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
756         throws IOException {
757       if (block == null || block.getOffset() != seekToBlock.getOffset()) {
758         updateCurrBlock(seekToBlock);
759       } else if (rewind) {
760         blockBuffer.rewind();
761       }
762 
763       // Update the nextIndexedKey
764       this.nextIndexedKey = nextIndexedKey;
765       return blockSeek(key, offset, length, seekBefore);
766     }
767 
768     /**
769      * Updates the current block to be the given {@link HFileBlock}. Seeks to
770      * the the first key/value pair.
771      *
772      * @param newBlock the block to make current
773      */
774     private void updateCurrBlock(HFileBlock newBlock) {
775       block = newBlock;
776 
777       // sanity check
778       if (block.getBlockType() != BlockType.DATA) {
779         throw new IllegalStateException("ScannerV2 works only on data " +
780             "blocks, got " + block.getBlockType() + "; " +
781             "fileName=" + reader.name + ", " +
782             "dataBlockEncoder=" + reader.dataBlockEncoder + ", " +
783             "isCompaction=" + isCompaction);
784       }
785 
786       blockBuffer = block.getBufferWithoutHeader();
787       readKeyValueLen();
788       blockFetches++;
789 
790       // Reset the next indexed key
791       this.nextIndexedKey = null;
792     }
793 
794     private final void readKeyValueLen() {
795       blockBuffer.mark();
796       currKeyLen = blockBuffer.getInt();
797       currValueLen = blockBuffer.getInt();
798       blockBuffer.reset();
799       if (this.reader.shouldIncludeMemstoreTS()) {
800         if (this.reader.decodeMemstoreTS) {
801           try {
802             int memstoreTSOffset = blockBuffer.arrayOffset()
803                 + blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen
804                 + currValueLen;
805             currMemstoreTS = Bytes.readVLong(blockBuffer.array(),
806                 memstoreTSOffset);
807             currMemstoreTSLen = WritableUtils.getVIntSize(currMemstoreTS);
808           } catch (Exception e) {
809             throw new RuntimeException("Error reading memstore timestamp", e);
810           }
811         } else {
812           currMemstoreTS = 0;
813           currMemstoreTSLen = 1;
814         }
815       }
816 
817       if (currKeyLen < 0 || currValueLen < 0
818           || currKeyLen > blockBuffer.limit()
819           || currValueLen > blockBuffer.limit()) {
820         throw new IllegalStateException("Invalid currKeyLen " + currKeyLen
821             + " or currValueLen " + currValueLen + ". Block offset: "
822             + block.getOffset() + ", block length: " + blockBuffer.limit()
823             + ", position: " + blockBuffer.position() + " (without header).");
824       }
825     }
826 
827     /**
828      * Within a loaded block, seek looking for the last key that is smaller
829      * than (or equal to?) the key we are interested in.
830      *
831      * A note on the seekBefore: if you have seekBefore = true, AND the first
832      * key in the block = key, then you'll get thrown exceptions. The caller has
833      * to check for that case and load the previous block as appropriate.
834      *
835      * @param key the key to find
836      * @param seekBefore find the key before the given key in case of exact
837      *          match.
838      * @return 0 in case of an exact key match, 1 in case of an inexact match,
839      *         -2 in case of an inexact match and furthermore, the input key less
840      *         than the first key of current block(e.g. using a faked index key)
841      */
842     private int blockSeek(byte[] key, int offset, int length,
843         boolean seekBefore) {
844       int klen, vlen;
845       long memstoreTS = 0;
846       int memstoreTSLen = 0;
847       int lastKeyValueSize = -1;
848       do {
849         blockBuffer.mark();
850         klen = blockBuffer.getInt();
851         vlen = blockBuffer.getInt();
852         blockBuffer.reset();
853         if (this.reader.shouldIncludeMemstoreTS()) {
854           if (this.reader.decodeMemstoreTS) {
855             try {
856               int memstoreTSOffset = blockBuffer.arrayOffset()
857                   + blockBuffer.position() + KEY_VALUE_LEN_SIZE + klen + vlen;
858               memstoreTS = Bytes.readVLong(blockBuffer.array(),
859                   memstoreTSOffset);
860               memstoreTSLen = WritableUtils.getVIntSize(memstoreTS);
861             } catch (Exception e) {
862               throw new RuntimeException("Error reading memstore timestamp", e);
863             }
864           } else {
865             memstoreTS = 0;
866             memstoreTSLen = 1;
867           }
868         }
869 
870         int keyOffset = blockBuffer.arrayOffset() + blockBuffer.position()
871             + KEY_VALUE_LEN_SIZE;
872         int comp = reader.getComparator().compareFlatKey(key, offset, length,
873             blockBuffer.array(), keyOffset, klen);
874 
875         if (comp == 0) {
876           if (seekBefore) {
877             if (lastKeyValueSize < 0) {
878               throw new IllegalStateException("blockSeek with seekBefore "
879                   + "at the first key of the block: key="
880                   + Bytes.toStringBinary(key) + ", blockOffset="
881                   + block.getOffset() + ", onDiskSize="
882                   + block.getOnDiskSizeWithHeader());
883             }
884             blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
885             readKeyValueLen();
886             return 1; // non exact match.
887           }
888           currKeyLen = klen;
889           currValueLen = vlen;
890           if (this.reader.shouldIncludeMemstoreTS()) {
891             currMemstoreTS = memstoreTS;
892             currMemstoreTSLen = memstoreTSLen;
893           }
894           return 0; // indicate exact match
895         } else if (comp < 0) {
896           if (lastKeyValueSize > 0)
897             blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
898           readKeyValueLen();
899           if (lastKeyValueSize == -1 && blockBuffer.position() == 0
900               && this.reader.trailer.getMinorVersion() >= MINOR_VERSION_WITH_FAKED_KEY) {
901             return HConstants.INDEX_KEY_MAGIC;
902           }
903           return 1;
904         }
905 
906         // The size of this key/value tuple, including key/value length fields.
907         lastKeyValueSize = klen + vlen + memstoreTSLen + KEY_VALUE_LEN_SIZE;
908         blockBuffer.position(blockBuffer.position() + lastKeyValueSize);
909       } while (blockBuffer.remaining() > 0);
910 
911       // Seek to the last key we successfully read. This will happen if this is
912       // the last key/value pair in the file, in which case the following call
913       // to next() has to return false.
914       blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
915       readKeyValueLen();
916       return 1; // didn't exactly find it.
917     }
918 
919     @Override
920     protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
921       ByteBuffer buffer = curBlock.getBufferWithoutHeader();
922       // It is safe to manipulate this buffer because we own the buffer object.
923       buffer.rewind();
924       int klen = buffer.getInt();
925       buffer.getInt();
926       ByteBuffer keyBuff = buffer.slice();
927       keyBuff.limit(klen);
928       keyBuff.rewind();
929       return keyBuff;
930     }
931 
932     @Override
933     public String getKeyString() {
934       return Bytes.toStringBinary(blockBuffer.array(),
935           blockBuffer.arrayOffset() + blockBuffer.position()
936               + KEY_VALUE_LEN_SIZE, currKeyLen);
937     }
938 
939     @Override
940     public String getValueString() {
941       return Bytes.toString(blockBuffer.array(), blockBuffer.arrayOffset()
942           + blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen,
943           currValueLen);
944     }
945   }
946 
947   /**
948    * ScannerV2 that operates on encoded data blocks.
949    */
950   protected static class EncodedScannerV2 extends AbstractScannerV2 {
951     private DataBlockEncoder.EncodedSeeker seeker = null;
952     private DataBlockEncoder dataBlockEncoder = null;
953     private final boolean includesMemstoreTS;
954 
955     public EncodedScannerV2(HFileReaderV2 reader, boolean cacheBlocks,
956         boolean pread, boolean isCompaction, boolean includesMemstoreTS) {
957       super(reader, cacheBlocks, pread, isCompaction);
958       this.includesMemstoreTS = includesMemstoreTS;
959     }
960 
961     private void setDataBlockEncoder(DataBlockEncoder dataBlockEncoder) {
962       this.dataBlockEncoder = dataBlockEncoder;
963       seeker = dataBlockEncoder.createSeeker(reader.getComparator(),
964           includesMemstoreTS);
965     }
966 
967     /**
968      * Updates the current block to be the given {@link HFileBlock}. Seeks to
969      * the the first key/value pair.
970      *
971      * @param newBlock the block to make current
972      */
973     private void updateCurrentBlock(HFileBlock newBlock) {
974       block = newBlock;
975 
976       // sanity checks
977       if (block.getBlockType() != BlockType.ENCODED_DATA) {
978         throw new IllegalStateException(
979             "EncodedScannerV2 works only on encoded data blocks");
980       }
981 
982       short dataBlockEncoderId = block.getDataBlockEncodingId();
983       if (dataBlockEncoder == null ||
984           !DataBlockEncoding.isCorrectEncoder(dataBlockEncoder,
985               dataBlockEncoderId)) {
986         DataBlockEncoder encoder =
987             DataBlockEncoding.getDataBlockEncoderById(dataBlockEncoderId);
988         setDataBlockEncoder(encoder);
989       }
990 
991       seeker.setCurrentBuffer(getEncodedBuffer(newBlock));
992       blockFetches++;
993 
994       // Reset the next indexed key
995       this.nextIndexedKey = null;
996     }
997 
998     private ByteBuffer getEncodedBuffer(HFileBlock newBlock) {
999       ByteBuffer origBlock = newBlock.getBufferReadOnly();
1000       ByteBuffer encodedBlock = ByteBuffer.wrap(origBlock.array(),
1001           origBlock.arrayOffset() + newBlock.headerSize() +
1002           DataBlockEncoding.ID_SIZE,
1003           newBlock.getUncompressedSizeWithoutHeader() -
1004           DataBlockEncoding.ID_SIZE).slice();
1005       return encodedBlock;
1006     }
1007 
1008     @Override
1009     public boolean seekTo() throws IOException {
1010       if (reader == null) {
1011         return false;
1012       }
1013 
1014       if (reader.getTrailer().getEntryCount() == 0) {
1015         // No data blocks.
1016         return false;
1017       }
1018 
1019       long firstDataBlockOffset =
1020           reader.getTrailer().getFirstDataBlockOffset();
1021       if (block != null && block.getOffset() == firstDataBlockOffset) {
1022         seeker.rewind();
1023         return true;
1024       }
1025 
1026       block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
1027           isCompaction, BlockType.DATA);
1028       if (block.getOffset() < 0) {
1029         throw new IOException("Invalid block offset: " + block.getOffset());
1030       }
1031       updateCurrentBlock(block);
1032       return true;
1033     }
1034 
1035     @Override
1036     public boolean next() throws IOException {
1037       boolean isValid = seeker.next();
1038       if (!isValid) {
1039         block = readNextDataBlock();
1040         isValid = block != null;
1041         if (isValid) {
1042           updateCurrentBlock(block);
1043         }
1044       }
1045       return isValid;
1046     }
1047 
1048     @Override
1049     public ByteBuffer getKey() {
1050       assertValidSeek();
1051       return seeker.getKeyDeepCopy();
1052     }
1053 
1054     @Override
1055     public int compareKey(KVComparator comparator, byte[] key, int offset, int length) {
1056       return seeker.compareKey(comparator, key, offset, length);
1057     }
1058 
1059     @Override
1060     public ByteBuffer getValue() {
1061       assertValidSeek();
1062       return seeker.getValueShallowCopy();
1063     }
1064 
1065     @Override
1066     public KeyValue getKeyValue() {
1067       if (block == null) {
1068         return null;
1069       }
1070       return seeker.getKeyValue();
1071     }
1072 
1073     @Override
1074     public String getKeyString() {
1075       ByteBuffer keyBuffer = getKey();
1076       return Bytes.toStringBinary(keyBuffer.array(),
1077           keyBuffer.arrayOffset(), keyBuffer.limit());
1078     }
1079 
1080     @Override
1081     public String getValueString() {
1082       ByteBuffer valueBuffer = getValue();
1083       return Bytes.toStringBinary(valueBuffer.array(),
1084           valueBuffer.arrayOffset(), valueBuffer.limit());
1085     }
1086 
1087     private void assertValidSeek() {
1088       if (block == null) {
1089         throw new NotSeekedException();
1090       }
1091     }
1092 
1093     @Override
1094     protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
1095       return dataBlockEncoder.getFirstKeyInBlock(getEncodedBuffer(curBlock));
1096     }
1097 
1098     @Override
1099     protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
1100         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
1101         throws IOException  {
1102       if (block == null || block.getOffset() != seekToBlock.getOffset()) {
1103         updateCurrentBlock(seekToBlock);
1104       } else if (rewind) {
1105         seeker.rewind();
1106       }
1107       this.nextIndexedKey = nextIndexedKey;
1108       return seeker.seekToKeyInBlock(key, offset, length, seekBefore);
1109     }
1110   }
1111 
1112   /**
1113    * Returns a buffer with the Bloom filter metadata. The caller takes
1114    * ownership of the buffer.
1115    */
1116   @Override
1117   public DataInput getGeneralBloomFilterMetadata() throws IOException {
1118     return this.getBloomFilterMetadata(BlockType.GENERAL_BLOOM_META);
1119   }
1120 
1121   @Override
1122   public DataInput getDeleteBloomFilterMetadata() throws IOException {
1123     return this.getBloomFilterMetadata(BlockType.DELETE_FAMILY_BLOOM_META);
1124   }
1125 
1126   private DataInput getBloomFilterMetadata(BlockType blockType)
1127   throws IOException {
1128     if (blockType != BlockType.GENERAL_BLOOM_META &&
1129         blockType != BlockType.DELETE_FAMILY_BLOOM_META) {
1130       throw new RuntimeException("Block Type: " + blockType.toString() +
1131           " is not supported") ;
1132     }
1133 
1134     for (HFileBlock b : loadOnOpenBlocks)
1135       if (b.getBlockType() == blockType)
1136         return b.getByteStream();
1137     return null;
1138   }
1139 
1140   @Override
1141   public boolean isFileInfoLoaded() {
1142     return true; // We load file info in constructor in version 2.
1143   }
1144 
1145   /**
1146    * Validates that the minor version is within acceptable limits.
1147    * Otherwise throws an Runtime exception
1148    */
1149   private void validateMinorVersion(Path path, int minorVersion) {
1150     if (minorVersion < MIN_MINOR_VERSION ||
1151         minorVersion > MAX_MINOR_VERSION) {
1152       String msg = "Minor version for path " + path + 
1153                    " is expected to be between " +
1154                    MIN_MINOR_VERSION + " and " + MAX_MINOR_VERSION +
1155                    " but is found to be " + minorVersion;
1156       LOG.error(msg);
1157       throw new RuntimeException(msg);
1158     }
1159   }
1160 }