View Javadoc

1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.io.hfile;
20  
21  import java.io.DataInput;
22  import java.io.IOException;
23  import java.nio.ByteBuffer;
24  import java.util.ArrayList;
25  import java.util.List;
26  
27  import org.apache.commons.logging.Log;
28  import org.apache.commons.logging.LogFactory;
29  import org.apache.hadoop.classification.InterfaceAudience;
30  import org.apache.hadoop.fs.FSDataInputStream;
31  import org.apache.hadoop.fs.Path;
32  import org.apache.hadoop.hbase.HConstants;
33  import org.apache.hadoop.hbase.KeyValue;
34  import org.apache.hadoop.hbase.fs.HFileSystem;
35  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoder;
36  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
37  import org.apache.hadoop.hbase.io.hfile.HFile.FileInfo;
38  import org.apache.hadoop.hbase.util.Bytes;
39  import org.apache.hadoop.hbase.util.IdLock;
40  import org.apache.hadoop.io.WritableUtils;
41  
42  /**
43   * {@link HFile} reader for version 2.
44   */
45  @InterfaceAudience.Private
46  public class HFileReaderV2 extends AbstractHFileReader {
47  
48    private static final Log LOG = LogFactory.getLog(HFileReaderV2.class);
49  
50    /**
51     * The size of a (key length, value length) tuple that prefixes each entry in
52     * a data block.
53     */
54    private static int KEY_VALUE_LEN_SIZE = 2 * Bytes.SIZEOF_INT;
55  
56    private boolean includesMemstoreTS = false;
57    private boolean decodeMemstoreTS = false;
58  
59    private boolean shouldIncludeMemstoreTS() {
60      return includesMemstoreTS;
61    }
62  
63    /**
64     * A "sparse lock" implementation allowing to lock on a particular block
65     * identified by offset. The purpose of this is to avoid two clients loading
66     * the same block, and have all but one client wait to get the block from the
67     * cache.
68     */
69    private IdLock offsetLock = new IdLock();
70  
71    /**
72     * Blocks read from the load-on-open section, excluding data root index, meta
73     * index, and file info.
74     */
75    private List<HFileBlock> loadOnOpenBlocks = new ArrayList<HFileBlock>();
76  
77    /** Minimum minor version supported by this HFile format */
78    static final int MIN_MINOR_VERSION = 0;
79  
80    /** Maximum minor version supported by this HFile format */
81    // We went to version 2 when we moved to pb'ing fileinfo and the trailer on
82    // the file. This version can read Writables version 1.
83    static final int MAX_MINOR_VERSION = 3;
84  
85    /** Minor versions starting with this number have faked index key */
86    static final int MINOR_VERSION_WITH_FAKED_KEY = 3;
87  
88    /**
89     * Opens a HFile. You must load the index before you can use it by calling
90     * {@link #loadFileInfo()}.
91     *
92     * @param path Path to HFile.
93     * @param trailer File trailer.
94     * @param fsdis input stream. Caller is responsible for closing the passed
95     *          stream.
96     * @param size Length of the stream.
97     * @param closeIStream Whether to close the stream.
98     * @param cacheConf Cache configuration.
99     * @param preferredEncodingInCache the encoding to use in cache in case we
100    *          have a choice. If the file is already encoded on disk, we will
101    *          still use its on-disk encoding in cache.
102    */
103   public HFileReaderV2(Path path, FixedFileTrailer trailer,
104       final FSDataInputStream fsdis, final FSDataInputStream fsdisNoFsChecksum,
105       final long size,
106       final boolean closeIStream, final CacheConfig cacheConf,
107       DataBlockEncoding preferredEncodingInCache, final HFileSystem hfs)
108       throws IOException {
109     super(path, trailer, fsdis, fsdisNoFsChecksum, size, 
110           closeIStream, cacheConf, hfs);
111     trailer.expectMajorVersion(2);
112     validateMinorVersion(path, trailer.getMinorVersion());
113     HFileBlock.FSReaderV2 fsBlockReaderV2 = new HFileBlock.FSReaderV2(fsdis,
114         fsdisNoFsChecksum,
115         compressAlgo, fileSize, trailer.getMinorVersion(), hfs, path);
116     this.fsBlockReader = fsBlockReaderV2; // upcast
117 
118     // Comparator class name is stored in the trailer in version 2.
119     comparator = trailer.createComparator();
120     dataBlockIndexReader = new HFileBlockIndex.BlockIndexReader(comparator,
121         trailer.getNumDataIndexLevels(), this);
122     metaBlockIndexReader = new HFileBlockIndex.BlockIndexReader(
123         Bytes.BYTES_RAWCOMPARATOR, 1);
124 
125     // Parse load-on-open data.
126 
127     HFileBlock.BlockIterator blockIter = fsBlockReaderV2.blockRange(
128         trailer.getLoadOnOpenDataOffset(),
129         fileSize - trailer.getTrailerSize());
130 
131     // Data index. We also read statistics about the block index written after
132     // the root level.
133     dataBlockIndexReader.readMultiLevelIndexRoot(
134         blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
135         trailer.getDataIndexCount());
136 
137     // Meta index.
138     metaBlockIndexReader.readRootIndex(
139         blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
140         trailer.getMetaIndexCount());
141 
142     // File info
143     fileInfo = new FileInfo();
144     fileInfo.read(blockIter.nextBlockWithBlockType(BlockType.FILE_INFO).getByteStream());
145     lastKey = fileInfo.get(FileInfo.LASTKEY);
146     avgKeyLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_KEY_LEN));
147     avgValueLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_VALUE_LEN));
148     byte [] keyValueFormatVersion =
149         fileInfo.get(HFileWriterV2.KEY_VALUE_VERSION);
150     includesMemstoreTS = keyValueFormatVersion != null &&
151         Bytes.toInt(keyValueFormatVersion) ==
152             HFileWriterV2.KEY_VALUE_VER_WITH_MEMSTORE;
153     fsBlockReaderV2.setIncludesMemstoreTS(includesMemstoreTS);
154     if (includesMemstoreTS) {
155       decodeMemstoreTS = Bytes.toLong(fileInfo.get(HFileWriterV2.MAX_MEMSTORE_TS_KEY)) > 0;
156     }
157 
158     // Read data block encoding algorithm name from file info.
159     dataBlockEncoder = HFileDataBlockEncoderImpl.createFromFileInfo(fileInfo,
160         preferredEncodingInCache);
161     fsBlockReaderV2.setDataBlockEncoder(dataBlockEncoder);
162 
163     // Store all other load-on-open blocks for further consumption.
164     HFileBlock b;
165     while ((b = blockIter.nextBlock()) != null) {
166       loadOnOpenBlocks.add(b);
167     }
168   }
169 
170   /**
171    * Create a Scanner on this file. No seeks or reads are done on creation. Call
172    * {@link HFileScanner#seekTo(byte[])} to position an start the read. There is
173    * nothing to clean up in a Scanner. Letting go of your references to the
174    * scanner is sufficient.
175    *
176    * @param cacheBlocks True if we should cache blocks read in by this scanner.
177    * @param pread Use positional read rather than seek+read if true (pread is
178    *          better for random reads, seek+read is better scanning).
179    * @param isCompaction is scanner being used for a compaction?
180    * @return Scanner on this file.
181    */
182    @Override
183    public HFileScanner getScanner(boolean cacheBlocks, final boolean pread,
184       final boolean isCompaction) {
185     // check if we want to use data block encoding in memory
186     if (dataBlockEncoder.useEncodedScanner(isCompaction)) {
187       return new EncodedScannerV2(this, cacheBlocks, pread, isCompaction,
188           includesMemstoreTS);
189     }
190 
191     return new ScannerV2(this, cacheBlocks, pread, isCompaction);
192   }
193 
194   /**
195    * @param metaBlockName
196    * @param cacheBlock Add block to cache, if found
197    * @return block wrapped in a ByteBuffer, with header skipped
198    * @throws IOException
199    */
200   @Override
201   public ByteBuffer getMetaBlock(String metaBlockName, boolean cacheBlock)
202       throws IOException {
203     if (trailer.getMetaIndexCount() == 0) {
204       return null; // there are no meta blocks
205     }
206     if (metaBlockIndexReader == null) {
207       throw new IOException("Meta index not loaded");
208     }
209 
210     byte[] mbname = Bytes.toBytes(metaBlockName);
211     int block = metaBlockIndexReader.rootBlockContainingKey(mbname, 0,
212         mbname.length);
213     if (block == -1)
214       return null;
215     long blockSize = metaBlockIndexReader.getRootBlockDataSize(block);
216     long startTimeNs = System.nanoTime();
217 
218     // Per meta key from any given file, synchronize reads for said block. This
219     // is OK to do for meta blocks because the meta block index is always
220     // single-level.
221     synchronized (metaBlockIndexReader.getRootBlockKey(block)) {
222       // Check cache for block. If found return.
223       long metaBlockOffset = metaBlockIndexReader.getRootBlockOffset(block);
224       BlockCacheKey cacheKey = new BlockCacheKey(name, metaBlockOffset,
225           DataBlockEncoding.NONE, BlockType.META);
226 
227       cacheBlock &= cacheConf.shouldCacheDataOnRead();
228       if (cacheConf.isBlockCacheEnabled()) {
229         HFileBlock cachedBlock =
230           (HFileBlock) cacheConf.getBlockCache().getBlock(cacheKey, cacheBlock, false);
231         if (cachedBlock != null) {
232           // Return a distinct 'shallow copy' of the block,
233           // so pos does not get messed by the scanner
234           return cachedBlock.getBufferWithoutHeader();
235         }
236         // Cache Miss, please load.
237       }
238 
239       HFileBlock metaBlock = fsBlockReader.readBlockData(metaBlockOffset,
240           blockSize, -1, true);
241 
242       final long delta = System.nanoTime() - startTimeNs;
243       HFile.offerReadLatency(delta, true);
244 
245       // Cache the block
246       if (cacheBlock) {
247         cacheConf.getBlockCache().cacheBlock(cacheKey, metaBlock,
248             cacheConf.isInMemory());
249       }
250 
251       return metaBlock.getBufferWithoutHeader();
252     }
253   }
254 
255   /**
256    * Read in a file block.
257    * @param dataBlockOffset offset to read.
258    * @param onDiskBlockSize size of the block
259    * @param cacheBlock
260    * @param pread Use positional read instead of seek+read (positional is
261    *          better doing random reads whereas seek+read is better scanning).
262    * @param isCompaction is this block being read as part of a compaction
263    * @param expectedBlockType the block type we are expecting to read with this
264    *          read operation, or null to read whatever block type is available
265    *          and avoid checking (that might reduce caching efficiency of
266    *          encoded data blocks)
267    * @return Block wrapped in a ByteBuffer.
268    * @throws IOException
269    */
270   @Override
271   public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize,
272       final boolean cacheBlock, boolean pread, final boolean isCompaction,
273       BlockType expectedBlockType)
274       throws IOException {
275     if (dataBlockIndexReader == null) {
276       throw new IOException("Block index not loaded");
277     }
278     if (dataBlockOffset < 0
279         || dataBlockOffset >= trailer.getLoadOnOpenDataOffset()) {
280       throw new IOException("Requested block is out of range: "
281           + dataBlockOffset + ", lastDataBlockOffset: "
282           + trailer.getLastDataBlockOffset());
283     }
284     // For any given block from any given file, synchronize reads for said
285     // block.
286     // Without a cache, this synchronizing is needless overhead, but really
287     // the other choice is to duplicate work (which the cache would prevent you
288     // from doing).
289 
290     BlockCacheKey cacheKey =
291         new BlockCacheKey(name, dataBlockOffset,
292             dataBlockEncoder.getEffectiveEncodingInCache(isCompaction),
293             expectedBlockType);
294 
295     boolean useLock = false;
296     IdLock.Entry lockEntry = null;
297     try {
298       while (true) {
299 
300         if (useLock) {
301           lockEntry = offsetLock.getLockEntry(dataBlockOffset);
302         }
303 
304         // Check cache for block. If found return.
305         if (cacheConf.isBlockCacheEnabled()) {
306           // Try and get the block from the block cache. If the useLock variable is true then this
307           // is the second time through the loop and it should not be counted as a block cache miss.
308           HFileBlock cachedBlock = (HFileBlock) cacheConf.getBlockCache().getBlock(cacheKey,
309               cacheBlock, useLock);
310           if (cachedBlock != null) {
311             if (cachedBlock.getBlockType() == BlockType.DATA) {
312               HFile.dataBlockReadCnt.incrementAndGet();
313             }
314 
315             validateBlockType(cachedBlock, expectedBlockType);
316 
317             // Validate encoding type for encoded blocks. We include encoding
318             // type in the cache key, and we expect it to match on a cache hit.
319             if (cachedBlock.getBlockType() == BlockType.ENCODED_DATA
320                 && cachedBlock.getDataBlockEncoding() != dataBlockEncoder.getEncodingInCache()) {
321               throw new IOException("Cached block under key " + cacheKey + " "
322                   + "has wrong encoding: " + cachedBlock.getDataBlockEncoding() + " (expected: "
323                   + dataBlockEncoder.getEncodingInCache() + ")");
324             }
325             return cachedBlock;
326           }
327           // Carry on, please load.
328         }
329         if (!useLock) {
330           // check cache again with lock
331           useLock = true;
332           continue;
333         }
334 
335         // Load block from filesystem.
336         long startTimeNs = System.nanoTime();
337         HFileBlock hfileBlock = fsBlockReader.readBlockData(dataBlockOffset, onDiskBlockSize, -1,
338             pread);
339         hfileBlock = dataBlockEncoder.diskToCacheFormat(hfileBlock, isCompaction);
340         validateBlockType(hfileBlock, expectedBlockType);
341 
342         final long delta = System.nanoTime() - startTimeNs;
343         HFile.offerReadLatency(delta, pread);
344 
345         // Cache the block if necessary
346         if (cacheBlock && cacheConf.shouldCacheBlockOnRead(hfileBlock.getBlockType().getCategory())) {
347           cacheConf.getBlockCache().cacheBlock(cacheKey, hfileBlock, cacheConf.isInMemory());
348         }
349 
350         if (hfileBlock.getBlockType() == BlockType.DATA) {
351           HFile.dataBlockReadCnt.incrementAndGet();
352         }
353 
354         return hfileBlock;
355       }
356     } finally {
357       if (lockEntry != null) {
358         offsetLock.releaseLockEntry(lockEntry);
359       }
360     }
361   }
362 
363   /**
364    * Compares the actual type of a block retrieved from cache or disk with its
365    * expected type and throws an exception in case of a mismatch. Expected
366    * block type of {@link BlockType#DATA} is considered to match the actual
367    * block type [@link {@link BlockType#ENCODED_DATA} as well.
368    * @param block a block retrieved from cache or disk
369    * @param expectedBlockType the expected block type, or null to skip the
370    *          check
371    */
372   private void validateBlockType(HFileBlock block,
373       BlockType expectedBlockType) throws IOException {
374     if (expectedBlockType == null) {
375       return;
376     }
377     BlockType actualBlockType = block.getBlockType();
378     if (actualBlockType == BlockType.ENCODED_DATA &&
379         expectedBlockType == BlockType.DATA) {
380       // We consider DATA to match ENCODED_DATA for the purpose of this
381       // verification.
382       return;
383     }
384     if (actualBlockType != expectedBlockType) {
385       throw new IOException("Expected block type " + expectedBlockType + ", " +
386           "but got " + actualBlockType + ": " + block);
387     }
388   }
389 
390   /**
391    * @return Last key in the file. May be null if file has no entries. Note that
392    *         this is not the last row key, but rather the byte form of the last
393    *         KeyValue.
394    */
395   @Override
396   public byte[] getLastKey() {
397     return dataBlockIndexReader.isEmpty() ? null : lastKey;
398   }
399 
400   /**
401    * @return Midkey for this file. We work with block boundaries only so
402    *         returned midkey is an approximation only.
403    * @throws IOException
404    */
405   @Override
406   public byte[] midkey() throws IOException {
407     return dataBlockIndexReader.midkey();
408   }
409 
410   @Override
411   public void close() throws IOException {
412     close(cacheConf.shouldEvictOnClose());
413   }
414 
415   public void close(boolean evictOnClose) throws IOException {
416     if (evictOnClose && cacheConf.isBlockCacheEnabled()) {
417       int numEvicted = cacheConf.getBlockCache().evictBlocksByHfileName(name);
418       if (LOG.isTraceEnabled()) {
419         LOG.trace("On close, file=" + name + " evicted=" + numEvicted
420           + " block(s)");
421       }
422     }
423     if (closeIStream) {
424       if (istream != istreamNoFsChecksum && istreamNoFsChecksum != null) {
425         istreamNoFsChecksum.close();
426         istreamNoFsChecksum = null;
427       }
428       if (istream != null) {
429         istream.close();
430         istream = null;
431       }
432     }
433   }
434 
435   protected abstract static class AbstractScannerV2
436       extends AbstractHFileReader.Scanner {
437     protected HFileBlock block;
438 
439     /**
440      * The next indexed key is to keep track of the indexed key of the next data block.
441      * If the nextIndexedKey is HConstants.NO_NEXT_INDEXED_KEY, it means that the
442      * current data block is the last data block.
443      *
444      * If the nextIndexedKey is null, it means the nextIndexedKey has not been loaded yet.
445      */
446     protected byte[] nextIndexedKey;
447 
448     public AbstractScannerV2(HFileReaderV2 r, boolean cacheBlocks,
449         final boolean pread, final boolean isCompaction) {
450       super(r, cacheBlocks, pread, isCompaction);
451     }
452 
453     /**
454      * An internal API function. Seek to the given key, optionally rewinding to
455      * the first key of the block before doing the seek.
456      *
457      * @param key key byte array
458      * @param offset key offset in the key byte array
459      * @param length key length
460      * @param rewind whether to rewind to the first key of the block before
461      *        doing the seek. If this is false, we are assuming we never go
462      *        back, otherwise the result is undefined.
463      * @return -1 if the key is earlier than the first key of the file,
464      *         0 if we are at the given key, 1 if we are past the given key
465      *         -2 if the key is earlier than the first key of the file while
466      *         using a faked index key
467      * @throws IOException
468      */
469     protected int seekTo(byte[] key, int offset, int length, boolean rewind)
470         throws IOException {
471       HFileBlockIndex.BlockIndexReader indexReader =
472           reader.getDataBlockIndexReader();
473       BlockWithScanInfo blockWithScanInfo =
474         indexReader.loadDataBlockWithScanInfo(key, offset, length, block,
475             cacheBlocks, pread, isCompaction);
476       if (blockWithScanInfo == null || blockWithScanInfo.getHFileBlock() == null) {
477         // This happens if the key e.g. falls before the beginning of the file.
478         return -1;
479       }
480       return loadBlockAndSeekToKey(blockWithScanInfo.getHFileBlock(),
481           blockWithScanInfo.getNextIndexedKey(), rewind, key, offset, length, false);
482     }
483 
484     protected abstract ByteBuffer getFirstKeyInBlock(HFileBlock curBlock);
485 
486     protected abstract int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
487         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
488         throws IOException;
489 
490     @Override
491     public int seekTo(byte[] key, int offset, int length) throws IOException {
492       // Always rewind to the first key of the block, because the given key
493       // might be before or after the current key.
494       return seekTo(key, offset, length, true);
495     }
496 
497     @Override
498     public int reseekTo(byte[] key, int offset, int length) throws IOException {
499       int compared;
500       if (isSeeked()) {
501         ByteBuffer bb = getKey();
502         compared = reader.getComparator().compare(key, offset,
503             length, bb.array(), bb.arrayOffset(), bb.limit());
504         if (compared < 1) {
505           // If the required key is less than or equal to current key, then
506           // don't do anything.
507           return compared;
508         } else {
509           if (this.nextIndexedKey != null &&
510               (this.nextIndexedKey == HConstants.NO_NEXT_INDEXED_KEY ||
511                reader.getComparator().compare(key, offset, length,
512                    nextIndexedKey, 0, nextIndexedKey.length) < 0)) {
513             // The reader shall continue to scan the current data block instead of querying the
514             // block index as long as it knows the target key is strictly smaller than
515             // the next indexed key or the current data block is the last data block.
516             return loadBlockAndSeekToKey(this.block, this.nextIndexedKey,
517                 false, key, offset, length, false);
518           }
519         }
520       }
521       // Don't rewind on a reseek operation, because reseek implies that we are
522       // always going forward in the file.
523       return seekTo(key, offset, length, false);
524     }
525 
526     @Override
527     public boolean seekBefore(byte[] key, int offset, int length)
528         throws IOException {
529       HFileBlock seekToBlock =
530           reader.getDataBlockIndexReader().seekToDataBlock(key, offset, length,
531               block, cacheBlocks, pread, isCompaction);
532       if (seekToBlock == null) {
533         return false;
534       }
535       ByteBuffer firstKey = getFirstKeyInBlock(seekToBlock);
536 
537       if (reader.getComparator().compare(firstKey.array(),
538           firstKey.arrayOffset(), firstKey.limit(), key, offset, length) == 0)
539       {
540         long previousBlockOffset = seekToBlock.getPrevBlockOffset();
541         // The key we are interested in
542         if (previousBlockOffset == -1) {
543           // we have a 'problem', the key we want is the first of the file.
544           return false;
545         }
546 
547         // It is important that we compute and pass onDiskSize to the block
548         // reader so that it does not have to read the header separately to
549         // figure out the size.
550         seekToBlock = reader.readBlock(previousBlockOffset,
551             seekToBlock.getOffset() - previousBlockOffset, cacheBlocks,
552             pread, isCompaction, BlockType.DATA);
553         // TODO shortcut: seek forward in this block to the last key of the
554         // block.
555       }
556       byte[] firstKeyInCurrentBlock = Bytes.getBytes(firstKey);
557       loadBlockAndSeekToKey(seekToBlock, firstKeyInCurrentBlock, true, key, offset, length, true);
558       return true;
559     }
560 
561 
562     /**
563      * Scans blocks in the "scanned" section of the {@link HFile} until the next
564      * data block is found.
565      *
566      * @return the next block, or null if there are no more data blocks
567      * @throws IOException
568      */
569     protected HFileBlock readNextDataBlock() throws IOException {
570       long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
571       if (block == null)
572         return null;
573 
574       HFileBlock curBlock = block;
575 
576       do {
577         if (curBlock.getOffset() >= lastDataBlockOffset)
578           return null;
579 
580         if (curBlock.getOffset() < 0) {
581           throw new IOException("Invalid block file offset: " + block);
582         }
583 
584         // We are reading the next block without block type validation, because
585         // it might turn out to be a non-data block.
586         curBlock = reader.readBlock(curBlock.getOffset()
587             + curBlock.getOnDiskSizeWithHeader(),
588             curBlock.getNextBlockOnDiskSizeWithHeader(), cacheBlocks, pread,
589             isCompaction, null);
590       } while (!(curBlock.getBlockType().equals(BlockType.DATA) ||
591           curBlock.getBlockType().equals(BlockType.ENCODED_DATA)));
592 
593       return curBlock;
594     }
595   }
596 
597   /**
598    * Implementation of {@link HFileScanner} interface.
599    */
600   protected static class ScannerV2 extends AbstractScannerV2 {
601     private HFileReaderV2 reader;
602 
603     public ScannerV2(HFileReaderV2 r, boolean cacheBlocks,
604         final boolean pread, final boolean isCompaction) {
605       super(r, cacheBlocks, pread, isCompaction);
606       this.reader = r;
607     }
608 
609     @Override
610     public KeyValue getKeyValue() {
611       if (!isSeeked())
612         return null;
613 
614       KeyValue ret = new KeyValue(blockBuffer.array(),
615           blockBuffer.arrayOffset() + blockBuffer.position(),
616           KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen,
617           currKeyLen);
618       if (this.reader.shouldIncludeMemstoreTS()) {
619         ret.setMemstoreTS(currMemstoreTS);
620       }
621       return ret;
622     }
623 
624     @Override
625     public ByteBuffer getKey() {
626       assertSeeked();
627       return ByteBuffer.wrap(
628           blockBuffer.array(),
629           blockBuffer.arrayOffset() + blockBuffer.position()
630               + KEY_VALUE_LEN_SIZE, currKeyLen).slice();
631     }
632 
633     @Override
634     public ByteBuffer getValue() {
635       assertSeeked();
636       return ByteBuffer.wrap(
637           blockBuffer.array(),
638           blockBuffer.arrayOffset() + blockBuffer.position()
639               + KEY_VALUE_LEN_SIZE + currKeyLen, currValueLen).slice();
640     }
641 
642     private void setNonSeekedState() {
643       block = null;
644       blockBuffer = null;
645       currKeyLen = 0;
646       currValueLen = 0;
647       currMemstoreTS = 0;
648       currMemstoreTSLen = 0;
649     }
650 
651     /**
652      * Go to the next key/value in the block section. Loads the next block if
653      * necessary. If successful, {@link #getKey()} and {@link #getValue()} can
654      * be called.
655      *
656      * @return true if successfully navigated to the next key/value
657      */
658     @Override
659     public boolean next() throws IOException {
660       assertSeeked();
661 
662       try {
663         blockBuffer.position(blockBuffer.position() + KEY_VALUE_LEN_SIZE
664             + currKeyLen + currValueLen + currMemstoreTSLen);
665       } catch (IllegalArgumentException e) {
666         LOG.error("Current pos = " + blockBuffer.position()
667             + "; currKeyLen = " + currKeyLen + "; currValLen = "
668             + currValueLen + "; block limit = " + blockBuffer.limit()
669             + "; HFile name = " + reader.getName()
670             + "; currBlock currBlockOffset = " + block.getOffset());
671         throw e;
672       }
673 
674       if (blockBuffer.remaining() <= 0) {
675         long lastDataBlockOffset =
676             reader.getTrailer().getLastDataBlockOffset();
677 
678         if (block.getOffset() >= lastDataBlockOffset) {
679           setNonSeekedState();
680           return false;
681         }
682 
683         // read the next block
684         HFileBlock nextBlock = readNextDataBlock();
685         if (nextBlock == null) {
686           setNonSeekedState();
687           return false;
688         }
689 
690         updateCurrBlock(nextBlock);
691         return true;
692       }
693 
694       // We are still in the same block.
695       readKeyValueLen();
696       return true;
697     }
698 
699     /**
700      * Positions this scanner at the start of the file.
701      *
702      * @return false if empty file; i.e. a call to next would return false and
703      *         the current key and value are undefined.
704      * @throws IOException
705      */
706     @Override
707     public boolean seekTo() throws IOException {
708       if (reader == null) {
709         return false;
710       }
711 
712       if (reader.getTrailer().getEntryCount() == 0) {
713         // No data blocks.
714         return false;
715       }
716 
717       long firstDataBlockOffset =
718           reader.getTrailer().getFirstDataBlockOffset();
719       if (block != null && block.getOffset() == firstDataBlockOffset) {
720         blockBuffer.rewind();
721         readKeyValueLen();
722         return true;
723       }
724 
725       block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
726           isCompaction, BlockType.DATA);
727       if (block.getOffset() < 0) {
728         throw new IOException("Invalid block offset: " + block.getOffset());
729       }
730       updateCurrBlock(block);
731       return true;
732     }
733 
734     @Override
735     protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
736         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
737         throws IOException {
738       if (block == null || block.getOffset() != seekToBlock.getOffset()) {
739         updateCurrBlock(seekToBlock);
740       } else if (rewind) {
741         blockBuffer.rewind();
742       }
743 
744       // Update the nextIndexedKey
745       this.nextIndexedKey = nextIndexedKey;
746       return blockSeek(key, offset, length, seekBefore);
747     }
748 
749     /**
750      * Updates the current block to be the given {@link HFileBlock}. Seeks to
751      * the the first key/value pair.
752      *
753      * @param newBlock the block to make current
754      */
755     private void updateCurrBlock(HFileBlock newBlock) {
756       block = newBlock;
757 
758       // sanity check
759       if (block.getBlockType() != BlockType.DATA) {
760         throw new IllegalStateException("ScannerV2 works only on data " +
761             "blocks, got " + block.getBlockType() + "; " +
762             "fileName=" + reader.name + ", " +
763             "dataBlockEncoder=" + reader.dataBlockEncoder + ", " +
764             "isCompaction=" + isCompaction);
765       }
766 
767       blockBuffer = block.getBufferWithoutHeader();
768       readKeyValueLen();
769       blockFetches++;
770 
771       // Reset the next indexed key
772       this.nextIndexedKey = null;
773     }
774 
775     private final void readKeyValueLen() {
776       blockBuffer.mark();
777       currKeyLen = blockBuffer.getInt();
778       currValueLen = blockBuffer.getInt();
779       blockBuffer.reset();
780       if (this.reader.shouldIncludeMemstoreTS()) {
781         if (this.reader.decodeMemstoreTS) {
782           try {
783             int memstoreTSOffset = blockBuffer.arrayOffset()
784                 + blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen
785                 + currValueLen;
786             currMemstoreTS = Bytes.readVLong(blockBuffer.array(),
787                 memstoreTSOffset);
788             currMemstoreTSLen = WritableUtils.getVIntSize(currMemstoreTS);
789           } catch (Exception e) {
790             throw new RuntimeException("Error reading memstore timestamp", e);
791           }
792         } else {
793           currMemstoreTS = 0;
794           currMemstoreTSLen = 1;
795         }
796       }
797 
798       if (currKeyLen < 0 || currValueLen < 0
799           || currKeyLen > blockBuffer.limit()
800           || currValueLen > blockBuffer.limit()) {
801         throw new IllegalStateException("Invalid currKeyLen " + currKeyLen
802             + " or currValueLen " + currValueLen + ". Block offset: "
803             + block.getOffset() + ", block length: " + blockBuffer.limit()
804             + ", position: " + blockBuffer.position() + " (without header).");
805       }
806     }
807 
808     /**
809      * Within a loaded block, seek looking for the last key that is smaller
810      * than (or equal to?) the key we are interested in.
811      *
812      * A note on the seekBefore: if you have seekBefore = true, AND the first
813      * key in the block = key, then you'll get thrown exceptions. The caller has
814      * to check for that case and load the previous block as appropriate.
815      *
816      * @param key the key to find
817      * @param seekBefore find the key before the given key in case of exact
818      *          match.
819      * @return 0 in case of an exact key match, 1 in case of an inexact match,
820      *         -2 in case of an inexact match and furthermore, the input key less
821      *         than the first key of current block(e.g. using a faked index key)
822      */
823     private int blockSeek(byte[] key, int offset, int length,
824         boolean seekBefore) {
825       int klen, vlen;
826       long memstoreTS = 0;
827       int memstoreTSLen = 0;
828       int lastKeyValueSize = -1;
829       do {
830         blockBuffer.mark();
831         klen = blockBuffer.getInt();
832         vlen = blockBuffer.getInt();
833         blockBuffer.reset();
834         if (this.reader.shouldIncludeMemstoreTS()) {
835           if (this.reader.decodeMemstoreTS) {
836             try {
837               int memstoreTSOffset = blockBuffer.arrayOffset()
838                   + blockBuffer.position() + KEY_VALUE_LEN_SIZE + klen + vlen;
839               memstoreTS = Bytes.readVLong(blockBuffer.array(),
840                   memstoreTSOffset);
841               memstoreTSLen = WritableUtils.getVIntSize(memstoreTS);
842             } catch (Exception e) {
843               throw new RuntimeException("Error reading memstore timestamp", e);
844             }
845           } else {
846             memstoreTS = 0;
847             memstoreTSLen = 1;
848           }
849         }
850 
851         int keyOffset = blockBuffer.arrayOffset() + blockBuffer.position()
852             + KEY_VALUE_LEN_SIZE;
853         int comp = reader.getComparator().compare(key, offset, length,
854             blockBuffer.array(), keyOffset, klen);
855 
856         if (comp == 0) {
857           if (seekBefore) {
858             if (lastKeyValueSize < 0) {
859               throw new IllegalStateException("blockSeek with seekBefore "
860                   + "at the first key of the block: key="
861                   + Bytes.toStringBinary(key) + ", blockOffset="
862                   + block.getOffset() + ", onDiskSize="
863                   + block.getOnDiskSizeWithHeader());
864             }
865             blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
866             readKeyValueLen();
867             return 1; // non exact match.
868           }
869           currKeyLen = klen;
870           currValueLen = vlen;
871           if (this.reader.shouldIncludeMemstoreTS()) {
872             currMemstoreTS = memstoreTS;
873             currMemstoreTSLen = memstoreTSLen;
874           }
875           return 0; // indicate exact match
876         } else if (comp < 0) {
877           if (lastKeyValueSize > 0)
878             blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
879           readKeyValueLen();
880           if (lastKeyValueSize == -1 && blockBuffer.position() == 0
881               && this.reader.trailer.getMinorVersion() >= MINOR_VERSION_WITH_FAKED_KEY) {
882             return HConstants.INDEX_KEY_MAGIC;
883           }
884           return 1;
885         }
886 
887         // The size of this key/value tuple, including key/value length fields.
888         lastKeyValueSize = klen + vlen + memstoreTSLen + KEY_VALUE_LEN_SIZE;
889         blockBuffer.position(blockBuffer.position() + lastKeyValueSize);
890       } while (blockBuffer.remaining() > 0);
891 
892       // Seek to the last key we successfully read. This will happen if this is
893       // the last key/value pair in the file, in which case the following call
894       // to next() has to return false.
895       blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
896       readKeyValueLen();
897       return 1; // didn't exactly find it.
898     }
899 
900     @Override
901     protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
902       ByteBuffer buffer = curBlock.getBufferWithoutHeader();
903       // It is safe to manipulate this buffer because we own the buffer object.
904       buffer.rewind();
905       int klen = buffer.getInt();
906       buffer.getInt();
907       ByteBuffer keyBuff = buffer.slice();
908       keyBuff.limit(klen);
909       keyBuff.rewind();
910       return keyBuff;
911     }
912 
913     @Override
914     public String getKeyString() {
915       return Bytes.toStringBinary(blockBuffer.array(),
916           blockBuffer.arrayOffset() + blockBuffer.position()
917               + KEY_VALUE_LEN_SIZE, currKeyLen);
918     }
919 
920     @Override
921     public String getValueString() {
922       return Bytes.toString(blockBuffer.array(), blockBuffer.arrayOffset()
923           + blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen,
924           currValueLen);
925     }
926   }
927 
928   /**
929    * ScannerV2 that operates on encoded data blocks.
930    */
931   protected static class EncodedScannerV2 extends AbstractScannerV2 {
932     private DataBlockEncoder.EncodedSeeker seeker = null;
933     private DataBlockEncoder dataBlockEncoder = null;
934     private final boolean includesMemstoreTS;
935 
936     public EncodedScannerV2(HFileReaderV2 reader, boolean cacheBlocks,
937         boolean pread, boolean isCompaction, boolean includesMemstoreTS) {
938       super(reader, cacheBlocks, pread, isCompaction);
939       this.includesMemstoreTS = includesMemstoreTS;
940     }
941 
942     private void setDataBlockEncoder(DataBlockEncoder dataBlockEncoder) {
943       this.dataBlockEncoder = dataBlockEncoder;
944       seeker = dataBlockEncoder.createSeeker(reader.getComparator(),
945           includesMemstoreTS);
946     }
947 
948     /**
949      * Updates the current block to be the given {@link HFileBlock}. Seeks to
950      * the the first key/value pair.
951      *
952      * @param newBlock the block to make current
953      */
954     private void updateCurrentBlock(HFileBlock newBlock) {
955       block = newBlock;
956 
957       // sanity checks
958       if (block.getBlockType() != BlockType.ENCODED_DATA) {
959         throw new IllegalStateException(
960             "EncodedScannerV2 works only on encoded data blocks");
961       }
962 
963       short dataBlockEncoderId = block.getDataBlockEncodingId();
964       if (dataBlockEncoder == null ||
965           !DataBlockEncoding.isCorrectEncoder(dataBlockEncoder,
966               dataBlockEncoderId)) {
967         DataBlockEncoder encoder =
968             DataBlockEncoding.getDataBlockEncoderById(dataBlockEncoderId);
969         setDataBlockEncoder(encoder);
970       }
971 
972       seeker.setCurrentBuffer(getEncodedBuffer(newBlock));
973       blockFetches++;
974     }
975 
976     private ByteBuffer getEncodedBuffer(HFileBlock newBlock) {
977       ByteBuffer origBlock = newBlock.getBufferReadOnly();
978       ByteBuffer encodedBlock = ByteBuffer.wrap(origBlock.array(),
979           origBlock.arrayOffset() + newBlock.headerSize() +
980           DataBlockEncoding.ID_SIZE,
981           newBlock.getUncompressedSizeWithoutHeader() -
982           DataBlockEncoding.ID_SIZE).slice();
983       return encodedBlock;
984     }
985 
986     @Override
987     public boolean seekTo() throws IOException {
988       if (reader == null) {
989         return false;
990       }
991 
992       if (reader.getTrailer().getEntryCount() == 0) {
993         // No data blocks.
994         return false;
995       }
996 
997       long firstDataBlockOffset =
998           reader.getTrailer().getFirstDataBlockOffset();
999       if (block != null && block.getOffset() == firstDataBlockOffset) {
1000         seeker.rewind();
1001         return true;
1002       }
1003 
1004       block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
1005           isCompaction, BlockType.DATA);
1006       if (block.getOffset() < 0) {
1007         throw new IOException("Invalid block offset: " + block.getOffset());
1008       }
1009       updateCurrentBlock(block);
1010       return true;
1011     }
1012 
1013     @Override
1014     public boolean next() throws IOException {
1015       boolean isValid = seeker.next();
1016       if (!isValid) {
1017         block = readNextDataBlock();
1018         isValid = block != null;
1019         if (isValid) {
1020           updateCurrentBlock(block);
1021         }
1022       }
1023       return isValid;
1024     }
1025 
1026     @Override
1027     public ByteBuffer getKey() {
1028       assertValidSeek();
1029       return seeker.getKeyDeepCopy();
1030     }
1031 
1032     @Override
1033     public ByteBuffer getValue() {
1034       assertValidSeek();
1035       return seeker.getValueShallowCopy();
1036     }
1037 
1038     @Override
1039     public KeyValue getKeyValue() {
1040       if (block == null) {
1041         return null;
1042       }
1043       return seeker.getKeyValue();
1044     }
1045 
1046     @Override
1047     public String getKeyString() {
1048       ByteBuffer keyBuffer = getKey();
1049       return Bytes.toStringBinary(keyBuffer.array(),
1050           keyBuffer.arrayOffset(), keyBuffer.limit());
1051     }
1052 
1053     @Override
1054     public String getValueString() {
1055       ByteBuffer valueBuffer = getValue();
1056       return Bytes.toStringBinary(valueBuffer.array(),
1057           valueBuffer.arrayOffset(), valueBuffer.limit());
1058     }
1059 
1060     private void assertValidSeek() {
1061       if (block == null) {
1062         throw new NotSeekedException();
1063       }
1064     }
1065 
1066     @Override
1067     protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
1068       return dataBlockEncoder.getFirstKeyInBlock(getEncodedBuffer(curBlock));
1069     }
1070 
1071     @Override
1072     protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
1073         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
1074         throws IOException  {
1075       if (block == null || block.getOffset() != seekToBlock.getOffset()) {
1076         updateCurrentBlock(seekToBlock);
1077       } else if (rewind) {
1078         seeker.rewind();
1079       }
1080       this.nextIndexedKey = nextIndexedKey;
1081       return seeker.seekToKeyInBlock(key, offset, length, seekBefore);
1082     }
1083   }
1084 
1085   /**
1086    * Returns a buffer with the Bloom filter metadata. The caller takes
1087    * ownership of the buffer.
1088    */
1089   @Override
1090   public DataInput getGeneralBloomFilterMetadata() throws IOException {
1091     return this.getBloomFilterMetadata(BlockType.GENERAL_BLOOM_META);
1092   }
1093 
1094   @Override
1095   public DataInput getDeleteBloomFilterMetadata() throws IOException {
1096     return this.getBloomFilterMetadata(BlockType.DELETE_FAMILY_BLOOM_META);
1097   }
1098 
1099   private DataInput getBloomFilterMetadata(BlockType blockType)
1100   throws IOException {
1101     if (blockType != BlockType.GENERAL_BLOOM_META &&
1102         blockType != BlockType.DELETE_FAMILY_BLOOM_META) {
1103       throw new RuntimeException("Block Type: " + blockType.toString() +
1104           " is not supported") ;
1105     }
1106 
1107     for (HFileBlock b : loadOnOpenBlocks)
1108       if (b.getBlockType() == blockType)
1109         return b.getByteStream();
1110     return null;
1111   }
1112 
1113   @Override
1114   public boolean isFileInfoLoaded() {
1115     return true; // We load file info in constructor in version 2.
1116   }
1117 
1118   /**
1119    * Validates that the minor version is within acceptable limits.
1120    * Otherwise throws an Runtime exception
1121    */
1122   private void validateMinorVersion(Path path, int minorVersion) {
1123     if (minorVersion < MIN_MINOR_VERSION ||
1124         minorVersion > MAX_MINOR_VERSION) {
1125       String msg = "Minor version for path " + path + 
1126                    " is expected to be between " +
1127                    MIN_MINOR_VERSION + " and " + MAX_MINOR_VERSION +
1128                    " but is found to be " + minorVersion;
1129       LOG.error(msg);
1130       throw new RuntimeException(msg);
1131     }
1132   }
1133 }