/*
 * Copyright 2011 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.io.hfile;

import java.io.DataInput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.fs.HFileSystem;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoder;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.hfile.BlockType.BlockCategory;
import org.apache.hadoop.hbase.io.hfile.HFile.FileInfo;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.IdLock;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableUtils;

/**
 * {@link HFile} reader for version 2.
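 *
 * <p>A minimal usage sketch (hedged: readers are normally obtained through
 * {@code HFile.createReader} rather than this constructor, and the
 * {@code fs}, {@code path} and {@code conf} variables below are
 * illustrative):
 * <pre>{@code
 * HFile.Reader reader = HFile.createReader(fs, path, new CacheConfig(conf));
 * reader.loadFileInfo();
 * HFileScanner scanner = reader.getScanner(true, false, false);
 * if (scanner.seekTo()) {
 *   do {
 *     KeyValue kv = scanner.getKeyValue();
 *     // process kv ...
 *   } while (scanner.next());
 * }
 * reader.close();
 * }</pre>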
 */
public class HFileReaderV2 extends AbstractHFileReader {

  private static final Log LOG = LogFactory.getLog(HFileReaderV2.class);

  /**
   * The size of a (key length, value length) tuple that prefixes each entry in
   * a data block.
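   *
   * <p>Each entry is laid out as
   * <pre>
   * key length (int) | value length (int) | key bytes | value bytes [| memstore timestamp (vlong)]
   * </pre>
   * where the trailing memstore timestamp is present only when the file was
   * written with {@link HFileWriterV2#KEY_VALUE_VER_WITH_MEMSTORE}.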
   */
  private static final int KEY_VALUE_LEN_SIZE = 2 * Bytes.SIZEOF_INT;

  private boolean includesMemstoreTS = false;
  private boolean decodeMemstoreTS = false;

  private boolean shouldIncludeMemstoreTS() {
    return includesMemstoreTS;
  }

  /**
   * A "sparse lock" implementation that allows locking on a particular block,
   * identified by its offset. The purpose is to avoid two clients loading the
   * same block and to have all but one client wait for it to appear in the
   * cache.
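   *
   * <p>{@link #readBlock} probes the cache once without this lock and, on a
   * miss, re-checks while holding the per-offset lock before reading from the
   * filesystem.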
   */
  private IdLock offsetLock = new IdLock();

  /**
   * Blocks read from the load-on-open section, excluding data root index, meta
   * index, and file info.
   */
  private List<HFileBlock> loadOnOpenBlocks = new ArrayList<HFileBlock>();

  /** Minimum minor version supported by this HFile format */
  static final int MIN_MINOR_VERSION = 0;

  /** Maximum minor version supported by this HFile format */
  static final int MAX_MINOR_VERSION = 1;

  /**
   * Opens an HFile. You must load the index by calling
   * {@link #loadFileInfo()} before you can use this reader.
   *
   * @param path Path to HFile.
   * @param trailer File trailer.
   * @param fsdis input stream. Caller is responsible for closing the passed
   *          stream.
   * @param fsdisNoFsChecksum input stream that does not verify filesystem
   *          checksums. Caller is responsible for closing the passed stream.
   * @param size Length of the stream.
   * @param closeIStream Whether to close the stream.
   * @param cacheConf Cache configuration.
   * @param preferredEncodingInCache the encoding to use in cache in case we
   *          have a choice. If the file is already encoded on disk, we will
   *          still use its on-disk encoding in cache.
   * @param hfs The file system.
   */
  public HFileReaderV2(Path path, FixedFileTrailer trailer,
      final FSDataInputStream fsdis, final FSDataInputStream fsdisNoFsChecksum,
      final long size,
      final boolean closeIStream, final CacheConfig cacheConf,
      DataBlockEncoding preferredEncodingInCache, final HFileSystem hfs)
      throws IOException {
    super(path, trailer, fsdis, fsdisNoFsChecksum, size,
        closeIStream, cacheConf, hfs);
    trailer.expectMajorVersion(2);
    validateMinorVersion(path, trailer.getMinorVersion());
    HFileBlock.FSReaderV2 fsBlockReaderV2 = new HFileBlock.FSReaderV2(fsdis,
        fsdisNoFsChecksum,
        compressAlgo, fileSize, trailer.getMinorVersion(), hfs, path);
    this.fsBlockReader = fsBlockReaderV2; // upcast

    // Comparator class name is stored in the trailer in version 2.
    comparator = trailer.createComparator();
    dataBlockIndexReader = new HFileBlockIndex.BlockIndexReader(comparator,
        trailer.getNumDataIndexLevels(), this);
    metaBlockIndexReader = new HFileBlockIndex.BlockIndexReader(
        Bytes.BYTES_RAWCOMPARATOR, 1);

    // Parse load-on-open data.

    HFileBlock.BlockIterator blockIter = fsBlockReaderV2.blockRange(
        trailer.getLoadOnOpenDataOffset(),
        fileSize - trailer.getTrailerSize());

    // Data index. We also read statistics about the block index written after
    // the root level.
    dataBlockIndexReader.readMultiLevelIndexRoot(
        blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
        trailer.getDataIndexCount());

    // Meta index.
    metaBlockIndexReader.readRootIndex(
        blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
        trailer.getMetaIndexCount());

    // File info
    fileInfo = new FileInfo();
    fileInfo.readFields(
        blockIter.nextBlockWithBlockType(BlockType.FILE_INFO).getByteStream());
    lastKey = fileInfo.get(FileInfo.LASTKEY);
    avgKeyLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_KEY_LEN));
    avgValueLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_VALUE_LEN));
    byte[] keyValueFormatVersion =
        fileInfo.get(HFileWriterV2.KEY_VALUE_VERSION);
    includesMemstoreTS = keyValueFormatVersion != null &&
        Bytes.toInt(keyValueFormatVersion) ==
            HFileWriterV2.KEY_VALUE_VER_WITH_MEMSTORE;
    fsBlockReaderV2.setIncludesMemstoreTS(includesMemstoreTS);
    if (includesMemstoreTS) {
      decodeMemstoreTS =
          Bytes.toLong(fileInfo.get(HFileWriterV2.MAX_MEMSTORE_TS_KEY)) > 0;
    }

    // Read data block encoding algorithm name from file info.
    dataBlockEncoder = HFileDataBlockEncoderImpl.createFromFileInfo(fileInfo,
        preferredEncodingInCache);
    fsBlockReaderV2.setDataBlockEncoder(dataBlockEncoder);

    // Store all other load-on-open blocks for further consumption.
    HFileBlock b;
    while ((b = blockIter.nextBlock()) != null) {
      loadOnOpenBlocks.add(b);
    }
  }

  /**
   * Create a Scanner on this file. No seeks or reads are done on creation.
   * Call {@link HFileScanner#seekTo(byte[])} to position the scanner and
   * start the read. There is nothing to clean up in a Scanner; letting go of
   * your references to the scanner is sufficient.
   *
   * @param cacheBlocks True if we should cache blocks read in by this scanner.
   * @param pread Use positional read rather than seek+read if true (pread is
   *          better for random reads, seek+read is better for scanning).
   * @param isCompaction is scanner being used for a compaction?
   * @return Scanner on this file.
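   *
   * <p>A hedged sketch of a keyed seek ({@code key} is an illustrative
   * KeyValue-format key):
   * <pre>{@code
   * HFileScanner scanner = reader.getScanner(true, true, false);
   * int result = scanner.seekTo(key);
   * if (result == 0) {
   *   KeyValue exactMatch = scanner.getKeyValue();  // positioned at key
   * } else if (result == 1) {
   *   KeyValue previous = scanner.getKeyValue();    // positioned before key
   * } // result == -1: key is before the first key of the file
   * }</pre>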
   */
  @Override
  public HFileScanner getScanner(boolean cacheBlocks, final boolean pread,
      final boolean isCompaction) {
    // check if we want to use data block encoding in memory
    if (dataBlockEncoder.useEncodedScanner(isCompaction)) {
      return new EncodedScannerV2(this, cacheBlocks, pread, isCompaction,
          includesMemstoreTS);
    }

    return new ScannerV2(this, cacheBlocks, pread, isCompaction);
  }

  /**
   * @param metaBlockName name of the meta block to retrieve
   * @param cacheBlock Add block to cache, if found
   * @return block wrapped in a ByteBuffer, with header skipped, or null if
   *         the file has no meta blocks or no block with the given name
   * @throws IOException
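   *
   * <p>A hedged sketch (the meta block name is illustrative; writers register
   * names via {@code HFile.Writer#appendMetaBlock}):
   * <pre>{@code
   * ByteBuffer meta = reader.getMetaBlock("MY_META_BLOCK", false);
   * if (meta != null) {
   *   // parse application-specific metadata ...
   * }
   * }</pre>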
   */
  @Override
  public ByteBuffer getMetaBlock(String metaBlockName, boolean cacheBlock)
      throws IOException {
    if (trailer.getMetaIndexCount() == 0) {
      return null; // there are no meta blocks
    }
    if (metaBlockIndexReader == null) {
      throw new IOException("Meta index not loaded");
    }

    byte[] mbname = Bytes.toBytes(metaBlockName);
    int block = metaBlockIndexReader.rootBlockContainingKey(mbname, 0,
        mbname.length);
    if (block == -1) {
      return null;
    }
    long blockSize = metaBlockIndexReader.getRootBlockDataSize(block);
    long startTimeNs = System.nanoTime();

    // Per meta key from any given file, synchronize reads for said block. This
    // is OK to do for meta blocks because the meta block index is always
    // single-level.
    synchronized (metaBlockIndexReader.getRootBlockKey(block)) {
      // Check cache for block. If found return.
      long metaBlockOffset = metaBlockIndexReader.getRootBlockOffset(block);
      BlockCacheKey cacheKey = new BlockCacheKey(name, metaBlockOffset,
          DataBlockEncoding.NONE, BlockType.META);

      cacheBlock &= cacheConf.shouldCacheDataOnRead();
      if (cacheConf.isBlockCacheEnabled()) {
        HFileBlock cachedBlock =
            (HFileBlock) cacheConf.getBlockCache().getBlock(cacheKey, cacheBlock, false);
        if (cachedBlock != null) {
          // Return a distinct 'shallow copy' of the block,
          // so pos does not get messed by the scanner
          getSchemaMetrics().updateOnCacheHit(BlockCategory.META, false);
          return cachedBlock.getBufferWithoutHeader();
        }
        // Cache Miss, please load.
      }

      HFileBlock metaBlock = fsBlockReader.readBlockData(metaBlockOffset,
          blockSize, -1, true);
      passSchemaMetricsTo(metaBlock);

      final long delta = System.nanoTime() - startTimeNs;
      HFile.offerReadLatency(delta, true);
      getSchemaMetrics().updateOnCacheMiss(BlockCategory.META, false, delta);

      // Cache the block
      if (cacheBlock) {
        cacheConf.getBlockCache().cacheBlock(cacheKey, metaBlock,
            cacheConf.isInMemory());
      }

      return metaBlock.getBufferWithoutHeader();
    }
  }

  /**
   * Read in a file block.
   * @param dataBlockOffset offset to read.
   * @param onDiskBlockSize size of the block
   * @param cacheBlock whether to cache the block read in
   * @param pread Use positional read instead of seek+read (positional is
   *          better for random reads whereas seek+read is better for
   *          scanning).
   * @param isCompaction is this block being read as part of a compaction
   * @param expectedBlockType the block type we are expecting to read with this
   *          read operation, or null to read whatever block type is available
   *          and avoid checking (that might reduce caching efficiency of
   *          encoded data blocks)
   * @return the block read, from the cache or the filesystem.
   * @throws IOException
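   *
   * <p>A hedged illustration (block offsets normally come from the block
   * index; an on-disk size of -1 means the size is unknown):
   * <pre>{@code
   * HFileBlock block = reader.readBlock(offset, -1, true, true, false,
   *     BlockType.DATA);
   * }</pre>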
   */
  @Override
  public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize,
      final boolean cacheBlock, boolean pread, final boolean isCompaction,
      BlockType expectedBlockType)
      throws IOException {
    if (dataBlockIndexReader == null) {
      throw new IOException("Block index not loaded");
    }
    if (dataBlockOffset < 0
        || dataBlockOffset >= trailer.getLoadOnOpenDataOffset()) {
      throw new IOException("Requested block is out of range: "
          + dataBlockOffset + ", lastDataBlockOffset: "
          + trailer.getLastDataBlockOffset());
    }
    // For any given block from any given file, synchronize reads for said
    // block.
    // Without a cache, this synchronizing is needless overhead, but really
    // the other choice is to duplicate work (which the cache would prevent you
    // from doing).

    BlockCacheKey cacheKey =
        new BlockCacheKey(name, dataBlockOffset,
            dataBlockEncoder.getEffectiveEncodingInCache(isCompaction),
            expectedBlockType);

    boolean useLock = false;
    IdLock.Entry lockEntry = null;

    try {
      while (true) {
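        // Two-pass cache probe: the first iteration checks the cache without
        // the per-offset lock; after a miss we loop back with useLock=true so
        // that only one reader loads the block and the rest wait for it.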

        if (useLock) {
          lockEntry = offsetLock.getLockEntry(dataBlockOffset);
        }

        // Check cache for block. If found return.
        if (cacheConf.isBlockCacheEnabled()) {
          // Try and get the block from the block cache. If the useLock variable is true
          // then this is the second time through the loop and it should not be counted
          // as a block cache miss.
          HFileBlock cachedBlock = (HFileBlock)
              cacheConf.getBlockCache().getBlock(cacheKey, cacheBlock, useLock);
          if (cachedBlock != null) {
            BlockCategory blockCategory =
                cachedBlock.getBlockType().getCategory();

            getSchemaMetrics().updateOnCacheHit(blockCategory, isCompaction);

            if (cachedBlock.getBlockType() == BlockType.DATA) {
              HFile.dataBlockReadCnt.incrementAndGet();
            }

            validateBlockType(cachedBlock, expectedBlockType);

            // Validate encoding type for encoded blocks. We include encoding
            // type in the cache key, and we expect it to match on a cache hit.
            if (cachedBlock.getBlockType() == BlockType.ENCODED_DATA &&
                cachedBlock.getDataBlockEncoding() !=
                    dataBlockEncoder.getEncodingInCache()) {
              throw new IOException("Cached block under key " + cacheKey + " " +
                  "has wrong encoding: " + cachedBlock.getDataBlockEncoding() +
                  " (expected: " + dataBlockEncoder.getEncodingInCache() + ")");
            }
            return cachedBlock;
          }
          // Carry on, please load.
        }
        if (!useLock) {
          // check cache again with lock
          useLock = true;
          continue;
        }

        // Load block from filesystem.
        long startTimeNs = System.nanoTime();
        HFileBlock hfileBlock = fsBlockReader.readBlockData(dataBlockOffset,
            onDiskBlockSize, -1, pread);
        hfileBlock = dataBlockEncoder.diskToCacheFormat(hfileBlock,
            isCompaction);
        validateBlockType(hfileBlock, expectedBlockType);
        passSchemaMetricsTo(hfileBlock);
        BlockCategory blockCategory = hfileBlock.getBlockType().getCategory();

        final long delta = System.nanoTime() - startTimeNs;
        HFile.offerReadLatency(delta, pread);
        getSchemaMetrics().updateOnCacheMiss(blockCategory, isCompaction, delta);

        // Cache the block if necessary
        if (cacheBlock && cacheConf.shouldCacheBlockOnRead(
            hfileBlock.getBlockType().getCategory())) {
          cacheConf.getBlockCache().cacheBlock(cacheKey, hfileBlock,
              cacheConf.isInMemory());
        }

        if (hfileBlock.getBlockType() == BlockType.DATA) {
          HFile.dataBlockReadCnt.incrementAndGet();
        }

        return hfileBlock;
      }
    } finally {
      if (lockEntry != null) {
        offsetLock.releaseLockEntry(lockEntry);
      }
    }
  }

  @Override
  public boolean hasMVCCInfo() {
    return includesMemstoreTS && decodeMemstoreTS;
  }

  /**
   * Compares the actual type of a block retrieved from cache or disk with its
   * expected type and throws an exception in case of a mismatch. An expected
   * block type of {@link BlockType#DATA} is considered to match the actual
   * block type {@link BlockType#ENCODED_DATA} as well.
   * @param block a block retrieved from cache or disk
   * @param expectedBlockType the expected block type, or null to skip the
   *          check
   */
  private void validateBlockType(HFileBlock block,
      BlockType expectedBlockType) throws IOException {
    if (expectedBlockType == null) {
      return;
    }
    BlockType actualBlockType = block.getBlockType();
    if (actualBlockType == BlockType.ENCODED_DATA &&
        expectedBlockType == BlockType.DATA) {
      // We consider DATA to match ENCODED_DATA for the purpose of this
      // verification.
      return;
    }
    if (actualBlockType != expectedBlockType) {
      throw new IOException("Expected block type " + expectedBlockType + ", " +
          "but got " + actualBlockType + ": " + block);
    }
  }

  /**
   * @return Last key in the file. May be null if file has no entries. Note
   *         that this is not the last row key, but rather the byte form of
   *         the last KeyValue.
   */
  @Override
  public byte[] getLastKey() {
    return dataBlockIndexReader.isEmpty() ? null : lastKey;
  }

  /**
   * @return Midkey for this file. Since we work only with block boundaries,
   *         the returned midkey is just an approximation.
   * @throws IOException
   */
  @Override
  public byte[] midkey() throws IOException {
    return dataBlockIndexReader.midkey();
  }

  @Override
  public void close() throws IOException {
    close(cacheConf.shouldEvictOnClose());
  }

  public void close(boolean evictOnClose) throws IOException {
    if (evictOnClose && cacheConf.isBlockCacheEnabled()) {
      int numEvicted = cacheConf.getBlockCache().evictBlocksByHfileName(name);
      if (LOG.isTraceEnabled()) {
        LOG.trace("On close, file=" + name + " evicted=" + numEvicted
          + " block(s)");
      }
    }
    if (closeIStream) {
      if (istream != istreamNoFsChecksum && istreamNoFsChecksum != null) {
        istreamNoFsChecksum.close();
        istreamNoFsChecksum = null;
      }
      if (istream != null) {
        istream.close();
        istream = null;
      }
    }

    getSchemaMetrics().flushMetrics();
  }

  protected abstract static class AbstractScannerV2
      extends AbstractHFileReader.Scanner {
    protected HFileBlock block;

    /**
     * Keeps track of the indexed key of the next data block. If this is
     * HConstants.NO_NEXT_INDEXED_KEY, the current data block is the last data
     * block.
     *
     * If this is null, the next indexed key has not been loaded yet.
     */
    protected byte[] nextIndexedKey;

    public AbstractScannerV2(HFileReaderV2 r, boolean cacheBlocks,
        final boolean pread, final boolean isCompaction) {
      super(r, cacheBlocks, pread, isCompaction);
    }

    /**
     * An internal API function. Seek to the given key, optionally rewinding to
     * the first key of the block before doing the seek.
     *
     * @param key key byte array
     * @param offset key offset in the key byte array
     * @param length key length
     * @param rewind whether to rewind to the first key of the block before
     *        doing the seek. If this is false, we assume we never go back;
     *        otherwise the result is undefined.
     * @return -1 if the key is earlier than the first key of the file,
     *         0 if we are at the given key, and 1 if we are past the given key
     * @throws IOException
     */
    protected int seekTo(byte[] key, int offset, int length, boolean rewind)
        throws IOException {
      HFileBlockIndex.BlockIndexReader indexReader =
          reader.getDataBlockIndexReader();
      BlockWithScanInfo blockWithScanInfo =
          indexReader.loadDataBlockWithScanInfo(key, offset, length, block,
              cacheBlocks, pread, isCompaction);
      if (blockWithScanInfo == null || blockWithScanInfo.getHFileBlock() == null) {
        // This happens if the key e.g. falls before the beginning of the file.
        return -1;
      }
      return loadBlockAndSeekToKey(blockWithScanInfo.getHFileBlock(),
          blockWithScanInfo.getNextIndexedKey(), rewind, key, offset, length, false);
    }

    protected abstract ByteBuffer getFirstKeyInBlock(HFileBlock curBlock);

    protected abstract int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
        boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
        throws IOException;

    @Override
    public int seekTo(byte[] key, int offset, int length) throws IOException {
      // Always rewind to the first key of the block, because the given key
      // might be before or after the current key.
      return seekTo(key, offset, length, true);
    }

    @Override
    public int reseekTo(byte[] key, int offset, int length) throws IOException {
      int compared;
      if (isSeeked()) {
        compared = compareKey(reader.getComparator(), key, offset, length);
        if (compared < 1) {
          // If the required key is less than or equal to current key, then
          // don't do anything.
          return compared;
        } else {
          if (this.nextIndexedKey != null &&
              (this.nextIndexedKey == HConstants.NO_NEXT_INDEXED_KEY ||
               reader.getComparator().compare(key, offset, length,
                   nextIndexedKey, 0, nextIndexedKey.length) < 0)) {
            // The reader shall continue to scan the current data block instead
            // of querying the block index as long as it knows the target key is
            // strictly smaller than the next indexed key or the current data
            // block is the last data block.
            return loadBlockAndSeekToKey(this.block, this.nextIndexedKey,
                false, key, offset, length, false);
          }
        }
      }
      // Don't rewind on a reseek operation, because reseek implies that we are
      // always going forward in the file.
      return seekTo(key, offset, length, false);
    }

    @Override
    public boolean seekBefore(byte[] key, int offset, int length)
        throws IOException {
      HFileBlock seekToBlock =
          reader.getDataBlockIndexReader().seekToDataBlock(key, offset, length,
              block, cacheBlocks, pread, isCompaction);
      if (seekToBlock == null) {
        return false;
      }
      ByteBuffer firstKey = getFirstKeyInBlock(seekToBlock);

      if (reader.getComparator().compare(firstKey.array(),
          firstKey.arrayOffset(), firstKey.limit(), key, offset, length) == 0) {
        long previousBlockOffset = seekToBlock.getPrevBlockOffset();
        // The key we are interested in
        if (previousBlockOffset == -1) {
          // we have a 'problem', the key we want is the first of the file.
          return false;
        }

        // It is important that we compute and pass onDiskSize to the block
        // reader so that it does not have to read the header separately to
        // figure out the size.
        seekToBlock = reader.readBlock(previousBlockOffset,
            seekToBlock.getOffset() - previousBlockOffset, cacheBlocks,
            pread, isCompaction, BlockType.DATA);
        // TODO shortcut: seek forward in this block to the last key of the
        // block.
      }
      byte[] firstKeyInCurrentBlock = Bytes.getBytes(firstKey);
      loadBlockAndSeekToKey(seekToBlock, firstKeyInCurrentBlock, true, key, offset, length, true);
      return true;
    }

    /**
     * Scans blocks in the "scanned" section of the {@link HFile} until the next
     * data block is found.
     *
     * @return the next block, or null if there are no more data blocks
     * @throws IOException
     */
    protected HFileBlock readNextDataBlock() throws IOException {
      long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
      if (block == null) {
        return null;
      }

      HFileBlock curBlock = block;

      do {
        if (curBlock.getOffset() >= lastDataBlockOffset) {
          return null;
        }

        if (curBlock.getOffset() < 0) {
          throw new IOException("Invalid block file offset: " + block);
        }

        // We are reading the next block without block type validation, because
        // it might turn out to be a non-data block.
        curBlock = reader.readBlock(curBlock.getOffset()
            + curBlock.getOnDiskSizeWithHeader(),
            curBlock.getNextBlockOnDiskSizeWithHeader(), cacheBlocks, pread,
            isCompaction, null);
      } while (!(curBlock.getBlockType().equals(BlockType.DATA) ||
          curBlock.getBlockType().equals(BlockType.ENCODED_DATA)));

      return curBlock;
    }

    /**
     * Compare the given key against the current key.
     * @param comparator the comparator to use
     * @param key key byte array
     * @param offset key offset in the byte array
     * @param length key length
     * @return -1 if the passed key is smaller than the current key, 0 if
     *         equal, and 1 if greater
     */
    public abstract int compareKey(RawComparator<byte[]> comparator, byte[] key, int offset,
        int length);
  }

  /**
   * Implementation of {@link HFileScanner} interface.
   */
  protected static class ScannerV2 extends AbstractScannerV2 {
    private HFileReaderV2 reader;

    public ScannerV2(HFileReaderV2 r, boolean cacheBlocks,
        final boolean pread, final boolean isCompaction) {
      super(r, cacheBlocks, pread, isCompaction);
      this.reader = r;
    }

    @Override
    public KeyValue getKeyValue() {
      if (!isSeeked()) {
        return null;
      }

      KeyValue ret = new KeyValue(blockBuffer.array(),
          blockBuffer.arrayOffset() + blockBuffer.position(),
          KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen,
          currKeyLen);
      if (this.reader.shouldIncludeMemstoreTS()) {
        ret.setMemstoreTS(currMemstoreTS);
      }
      return ret;
    }

    @Override
    public ByteBuffer getKey() {
      assertSeeked();
      return ByteBuffer.wrap(
          blockBuffer.array(),
          blockBuffer.arrayOffset() + blockBuffer.position()
              + KEY_VALUE_LEN_SIZE, currKeyLen).slice();
    }

    @Override
    public int compareKey(RawComparator<byte[]> comparator, byte[] key, int offset, int length) {
      return comparator.compare(key, offset, length, blockBuffer.array(), blockBuffer.arrayOffset()
          + blockBuffer.position() + KEY_VALUE_LEN_SIZE, currKeyLen);
    }

    @Override
    public ByteBuffer getValue() {
      assertSeeked();
      return ByteBuffer.wrap(
          blockBuffer.array(),
          blockBuffer.arrayOffset() + blockBuffer.position()
              + KEY_VALUE_LEN_SIZE + currKeyLen, currValueLen).slice();
    }

    private void setNonSeekedState() {
      block = null;
      blockBuffer = null;
      currKeyLen = 0;
      currValueLen = 0;
      currMemstoreTS = 0;
      currMemstoreTSLen = 0;
    }

    /**
     * Go to the next key/value in the block section. Loads the next block if
     * necessary. If successful, {@link #getKey()} and {@link #getValue()} can
     * be called.
     *
     * @return true if successfully navigated to the next key/value
     */
    @Override
    public boolean next() throws IOException {
      assertSeeked();

      try {
        blockBuffer.position(blockBuffer.position() + KEY_VALUE_LEN_SIZE
            + currKeyLen + currValueLen + currMemstoreTSLen);
      } catch (IllegalArgumentException e) {
        LOG.error("Current pos = " + blockBuffer.position()
            + "; currKeyLen = " + currKeyLen + "; currValLen = "
            + currValueLen + "; block limit = " + blockBuffer.limit()
            + "; HFile name = " + reader.getName()
            + "; currBlock currBlockOffset = " + block.getOffset());
        throw e;
      }

      if (blockBuffer.remaining() <= 0) {
        long lastDataBlockOffset =
            reader.getTrailer().getLastDataBlockOffset();

        if (block.getOffset() >= lastDataBlockOffset) {
          setNonSeekedState();
          return false;
        }

        // read the next block
        HFileBlock nextBlock = readNextDataBlock();
        if (nextBlock == null) {
          setNonSeekedState();
          return false;
        }

        updateCurrBlock(nextBlock);
        return true;
      }

      // We are still in the same block.
      readKeyValueLen();
      return true;
    }

    /**
     * Positions this scanner at the start of the file.
     *
     * @return false if empty file; i.e. a call to next would return false and
     *         the current key and value are undefined.
     * @throws IOException
     */
    @Override
    public boolean seekTo() throws IOException {
      if (reader == null) {
        return false;
      }

      if (reader.getTrailer().getEntryCount() == 0) {
        // No data blocks.
        return false;
      }

      long firstDataBlockOffset =
          reader.getTrailer().getFirstDataBlockOffset();
      if (block != null && block.getOffset() == firstDataBlockOffset) {
        blockBuffer.rewind();
        readKeyValueLen();
        return true;
      }

      block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
          isCompaction, BlockType.DATA);
      if (block.getOffset() < 0) {
        throw new IOException("Invalid block offset: " + block.getOffset());
      }
      updateCurrBlock(block);
      return true;
    }

    @Override
    protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
        boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
        throws IOException {
      if (block == null || block.getOffset() != seekToBlock.getOffset()) {
        updateCurrBlock(seekToBlock);
      } else if (rewind) {
        blockBuffer.rewind();
      }

      // Update the nextIndexedKey
      this.nextIndexedKey = nextIndexedKey;
      return blockSeek(key, offset, length, seekBefore);
    }

    /**
     * Updates the current block to be the given {@link HFileBlock}. Seeks to
     * the first key/value pair.
     *
     * @param newBlock the block to make current
     */
    private void updateCurrBlock(HFileBlock newBlock) {
      block = newBlock;

      // sanity check
      if (block.getBlockType() != BlockType.DATA) {
        throw new IllegalStateException("ScannerV2 works only on data " +
            "blocks, got " + block.getBlockType() + "; " +
            "fileName=" + reader.name + ", " +
            "dataBlockEncoder=" + reader.dataBlockEncoder + ", " +
            "isCompaction=" + isCompaction);
      }

      blockBuffer = block.getBufferWithoutHeader();
      readKeyValueLen();
      blockFetches++;

      // Reset the next indexed key
      this.nextIndexedKey = null;
    }

    private final void readKeyValueLen() {
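      // Peek at the key/value lengths without consuming them: mark the
      // buffer, read the two length ints, then reset back to the mark.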
      blockBuffer.mark();
      currKeyLen = blockBuffer.getInt();
      currValueLen = blockBuffer.getInt();
      blockBuffer.reset();
      if (this.reader.shouldIncludeMemstoreTS()) {
        if (this.reader.decodeMemstoreTS) {
          try {
            int memstoreTSOffset = blockBuffer.arrayOffset()
                + blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen
                + currValueLen;
            currMemstoreTS = Bytes.readVLong(blockBuffer.array(),
                memstoreTSOffset);
            currMemstoreTSLen = WritableUtils.getVIntSize(currMemstoreTS);
          } catch (Exception e) {
            throw new RuntimeException("Error reading memstore timestamp", e);
          }
        } else {
          currMemstoreTS = 0;
          currMemstoreTSLen = 1;
        }
      }

      if (currKeyLen < 0 || currValueLen < 0
          || currKeyLen > blockBuffer.limit()
          || currValueLen > blockBuffer.limit()) {
        throw new IllegalStateException("Invalid currKeyLen " + currKeyLen
            + " or currValueLen " + currValueLen + ". Block offset: "
            + block.getOffset() + ", block length: " + blockBuffer.limit()
            + ", position: " + blockBuffer.position() + " (without header).");
      }
    }

    /**
     * Within a loaded block, seek looking for the last key that is smaller
     * than or equal to the key we are interested in.
     *
     * A note on seekBefore: if seekBefore = true AND the first key in the
     * block equals the given key, an exception is thrown. The caller has to
     * check for that case and load the previous block as appropriate.
     *
     * @param key the key to find
     * @param seekBefore find the key before the given key in case of an exact
     *          match.
     * @return 0 in case of an exact key match, 1 in case of an inexact match
     */
    private int blockSeek(byte[] key, int offset, int length,
        boolean seekBefore) {
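      // Linear scan over the block's entries, remembering the size of the
      // previous key/value so seekBefore can step back exactly one entry.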
      int klen, vlen;
      long memstoreTS = 0;
      int memstoreTSLen = 0;
      int lastKeyValueSize = -1;
      do {
        blockBuffer.mark();
        klen = blockBuffer.getInt();
        vlen = blockBuffer.getInt();
        blockBuffer.reset();
        if (this.reader.shouldIncludeMemstoreTS()) {
          if (this.reader.decodeMemstoreTS) {
            try {
              int memstoreTSOffset = blockBuffer.arrayOffset()
                  + blockBuffer.position() + KEY_VALUE_LEN_SIZE + klen + vlen;
              memstoreTS = Bytes.readVLong(blockBuffer.array(),
                  memstoreTSOffset);
              memstoreTSLen = WritableUtils.getVIntSize(memstoreTS);
            } catch (Exception e) {
              throw new RuntimeException("Error reading memstore timestamp", e);
            }
          } else {
            memstoreTS = 0;
            memstoreTSLen = 1;
          }
        }

        int keyOffset = blockBuffer.arrayOffset() + blockBuffer.position()
            + KEY_VALUE_LEN_SIZE;
        int comp = reader.getComparator().compare(key, offset, length,
            blockBuffer.array(), keyOffset, klen);

        if (comp == 0) {
          if (seekBefore) {
            if (lastKeyValueSize < 0) {
              throw new IllegalStateException("blockSeek with seekBefore "
                  + "at the first key of the block: key="
                  + Bytes.toStringBinary(key) + ", blockOffset="
                  + block.getOffset() + ", onDiskSize="
                  + block.getOnDiskSizeWithHeader());
            }
            blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
            readKeyValueLen();
            return 1; // non exact match.
          }
          currKeyLen = klen;
          currValueLen = vlen;
          if (this.reader.shouldIncludeMemstoreTS()) {
            currMemstoreTS = memstoreTS;
            currMemstoreTSLen = memstoreTSLen;
          }
          return 0; // indicate exact match
        }

        if (comp < 0) {
          if (lastKeyValueSize > 0) {
            blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
          }
          readKeyValueLen();
          return 1;
        }

        // The size of this key/value tuple, including key/value length fields.
        lastKeyValueSize = klen + vlen + memstoreTSLen + KEY_VALUE_LEN_SIZE;
        blockBuffer.position(blockBuffer.position() + lastKeyValueSize);
      } while (blockBuffer.remaining() > 0);

      // Seek to the last key we successfully read. This will happen if this is
      // the last key/value pair in the file, in which case the following call
      // to next() has to return false.
      blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
      readKeyValueLen();
      return 1; // didn't exactly find it.
    }

    @Override
    protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
      ByteBuffer buffer = curBlock.getBufferWithoutHeader();
      // It is safe to manipulate this buffer because we own the buffer object.
      buffer.rewind();
      int klen = buffer.getInt();
      buffer.getInt(); // skip the value length
      ByteBuffer keyBuff = buffer.slice();
      keyBuff.limit(klen);
      keyBuff.rewind();
      return keyBuff;
    }

    @Override
    public String getKeyString() {
      return Bytes.toStringBinary(blockBuffer.array(),
          blockBuffer.arrayOffset() + blockBuffer.position()
              + KEY_VALUE_LEN_SIZE, currKeyLen);
    }

    @Override
    public String getValueString() {
      return Bytes.toString(blockBuffer.array(), blockBuffer.arrayOffset()
          + blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen,
          currValueLen);
    }
  }

  /**
   * ScannerV2 that operates on encoded data blocks.
   */
  protected static class EncodedScannerV2 extends AbstractScannerV2 {
    private DataBlockEncoder.EncodedSeeker seeker = null;
    private DataBlockEncoder dataBlockEncoder = null;
    private final boolean includesMemstoreTS;

    public EncodedScannerV2(HFileReaderV2 reader, boolean cacheBlocks,
        boolean pread, boolean isCompaction, boolean includesMemstoreTS) {
      super(reader, cacheBlocks, pread, isCompaction);
      this.includesMemstoreTS = includesMemstoreTS;
    }

    private void setDataBlockEncoder(DataBlockEncoder dataBlockEncoder) {
      this.dataBlockEncoder = dataBlockEncoder;
      seeker = dataBlockEncoder.createSeeker(reader.getComparator(),
          includesMemstoreTS);
    }

    /**
     * Updates the current block to be the given {@link HFileBlock}. Seeks to
     * the first key/value pair.
     *
     * @param newBlock the block to make current
     */
    private void updateCurrentBlock(HFileBlock newBlock) {
      block = newBlock;

      // sanity checks
      if (block.getBlockType() != BlockType.ENCODED_DATA) {
        throw new IllegalStateException(
            "EncodedScannerV2 works only on encoded data blocks");
      }

      short dataBlockEncoderId = block.getDataBlockEncodingId();
      if (dataBlockEncoder == null ||
          !DataBlockEncoding.isCorrectEncoder(dataBlockEncoder,
              dataBlockEncoderId)) {
        DataBlockEncoder encoder =
            DataBlockEncoding.getDataBlockEncoderById(dataBlockEncoderId);
        setDataBlockEncoder(encoder);
      }

      seeker.setCurrentBuffer(getEncodedBuffer(newBlock));
      blockFetches++;
    }

    private ByteBuffer getEncodedBuffer(HFileBlock newBlock) {
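      // Skip the block header and the 2-byte encoding id so the returned
      // buffer exposes only the encoded key/value payload.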
      ByteBuffer origBlock = newBlock.getBufferReadOnly();
      ByteBuffer encodedBlock = ByteBuffer.wrap(origBlock.array(),
          origBlock.arrayOffset() + newBlock.headerSize() +
          DataBlockEncoding.ID_SIZE,
          newBlock.getUncompressedSizeWithoutHeader() -
          DataBlockEncoding.ID_SIZE).slice();
      return encodedBlock;
    }

    @Override
    public boolean seekTo() throws IOException {
      if (reader == null) {
        return false;
      }

      if (reader.getTrailer().getEntryCount() == 0) {
        // No data blocks.
        return false;
      }

      long firstDataBlockOffset =
          reader.getTrailer().getFirstDataBlockOffset();
      if (block != null && block.getOffset() == firstDataBlockOffset) {
        seeker.rewind();
        return true;
      }

      block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
          isCompaction, BlockType.DATA);
      if (block.getOffset() < 0) {
        throw new IOException("Invalid block offset: " + block.getOffset());
      }
      updateCurrentBlock(block);
      return true;
    }

    @Override
    public boolean next() throws IOException {
      boolean isValid = seeker.next();
      if (!isValid) {
        block = readNextDataBlock();
        isValid = block != null;
        if (isValid) {
          updateCurrentBlock(block);
        }
      }
      return isValid;
    }

    @Override
    public ByteBuffer getKey() {
      assertValidSeek();
      return seeker.getKeyDeepCopy();
    }

    @Override
    public int compareKey(RawComparator<byte[]> comparator, byte[] key, int offset, int length) {
      return seeker.compareKey(comparator, key, offset, length);
    }

    @Override
    public ByteBuffer getValue() {
      assertValidSeek();
      return seeker.getValueShallowCopy();
    }

    @Override
    public KeyValue getKeyValue() {
      if (block == null) {
        return null;
      }
      return seeker.getKeyValue();
    }

    @Override
    public String getKeyString() {
      ByteBuffer keyBuffer = getKey();
      return Bytes.toStringBinary(keyBuffer.array(),
          keyBuffer.arrayOffset(), keyBuffer.limit());
    }

    @Override
    public String getValueString() {
      ByteBuffer valueBuffer = getValue();
      return Bytes.toStringBinary(valueBuffer.array(),
          valueBuffer.arrayOffset(), valueBuffer.limit());
    }

    private void assertValidSeek() {
      if (block == null) {
        throw new NotSeekedException();
      }
    }

    @Override
    protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
      return dataBlockEncoder.getFirstKeyInBlock(getEncodedBuffer(curBlock));
    }

    @Override
    protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
        boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
        throws IOException {
      if (block == null || block.getOffset() != seekToBlock.getOffset()) {
        updateCurrentBlock(seekToBlock);
      } else if (rewind) {
        seeker.rewind();
      }
      this.nextIndexedKey = nextIndexedKey;
      return seeker.seekToKeyInBlock(key, offset, length, seekBefore);
    }
  }

  /**
   * Returns a buffer with the Bloom filter metadata. The caller takes
   * ownership of the buffer.
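   *
   * <p>A hedged sketch of typical use (assuming the
   * {@code BloomFilterFactory.createFromMeta} helper used by store file
   * readers):
   * <pre>{@code
   * DataInput meta = reader.getGeneralBloomFilterMetadata();
   * if (meta != null) {
   *   BloomFilter bloomFilter = BloomFilterFactory.createFromMeta(meta, reader);
   * }
   * }</pre>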
   */
  @Override
  public DataInput getGeneralBloomFilterMetadata() throws IOException {
    return this.getBloomFilterMetadata(BlockType.GENERAL_BLOOM_META);
  }

  @Override
  public DataInput getDeleteBloomFilterMetadata() throws IOException {
    return this.getBloomFilterMetadata(BlockType.DELETE_FAMILY_BLOOM_META);
  }

  private DataInput getBloomFilterMetadata(BlockType blockType)
      throws IOException {
    if (blockType != BlockType.GENERAL_BLOOM_META &&
        blockType != BlockType.DELETE_FAMILY_BLOOM_META) {
      throw new RuntimeException("Block Type: " + blockType.toString() +
          " is not supported");
    }

    for (HFileBlock b : loadOnOpenBlocks) {
      if (b.getBlockType() == blockType) {
        return b.getByteStream();
      }
    }
    return null;
  }

  @Override
  public boolean isFileInfoLoaded() {
    return true; // We load file info in constructor in version 2.
  }

  /**
   * Validates that the minor version is within acceptable limits.
   * Otherwise throws a RuntimeException.
   */
  private void validateMinorVersion(Path path, int minorVersion) {
    if (minorVersion < MIN_MINOR_VERSION ||
        minorVersion > MAX_MINOR_VERSION) {
      String msg = "Minor version for path " + path +
                   " is expected to be between " +
                   MIN_MINOR_VERSION + " and " + MAX_MINOR_VERSION +
                   " but is found to be " + minorVersion;
      LOG.error(msg);
      throw new RuntimeException(msg);
    }
  }
}