View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.io.hfile;
19  
20  import java.io.DataInput;
21  import java.io.IOException;
22  import java.nio.ByteBuffer;
23  import java.util.ArrayList;
24  import java.util.List;
25  
26  import org.apache.commons.logging.Log;
27  import org.apache.commons.logging.LogFactory;
28  import org.apache.hadoop.hbase.classification.InterfaceAudience;
29  import org.apache.hadoop.conf.Configuration;
30  import org.apache.hadoop.fs.Path;
31  import org.apache.hadoop.hbase.HConstants;
32  import org.apache.hadoop.hbase.KeyValue;
33  import org.apache.hadoop.hbase.KeyValue.KVComparator;
34  import org.apache.hadoop.hbase.NoTagsKeyValue;
35  import org.apache.hadoop.hbase.fs.HFileSystem;
36  import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper;
37  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoder;
38  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
39  import org.apache.hadoop.hbase.io.encoding.HFileBlockDecodingContext;
40  import org.apache.hadoop.hbase.io.hfile.HFile.FileInfo;
41  import org.apache.hadoop.hbase.util.ByteBufferUtils;
42  import org.apache.hadoop.hbase.util.Bytes;
43  import org.apache.hadoop.hbase.util.IdLock;
44  import org.apache.hadoop.io.WritableUtils;
45  import org.cloudera.htrace.Trace;
46  import org.cloudera.htrace.TraceScope;
47  
48  import com.google.common.annotations.VisibleForTesting;
49  
50  /**
51   * {@link HFile} reader for version 2.
52   */
53  @InterfaceAudience.Private
54  public class HFileReaderV2 extends AbstractHFileReader {
55  
56    private static final Log LOG = LogFactory.getLog(HFileReaderV2.class);
57  
58    /** Minor versions in HFile V2 starting with this number have hbase checksums */
59    public static final int MINOR_VERSION_WITH_CHECKSUM = 1;
60    /** In HFile V2 minor version that does not support checksums */
61    public static final int MINOR_VERSION_NO_CHECKSUM = 0;
62  
63    /** HFile minor version that introduced pbuf filetrailer */
64    public static final int PBUF_TRAILER_MINOR_VERSION = 2;
65  
66    /**
67     * The size of a (key length, value length) tuple that prefixes each entry in
68     * a data block.
69     */
70    public final static int KEY_VALUE_LEN_SIZE = 2 * Bytes.SIZEOF_INT;
71  
72    protected boolean includesMemstoreTS = false;
73    protected boolean decodeMemstoreTS = false;
74    protected boolean shouldIncludeMemstoreTS() {
75      return includesMemstoreTS;
76    }
77  
78    /** Filesystem-level block reader. */
79    protected HFileBlock.FSReader fsBlockReader;
80  
81    /**
82     * A "sparse lock" implementation allowing to lock on a particular block
83     * identified by offset. The purpose of this is to avoid two clients loading
84     * the same block, and have all but one client wait to get the block from the
85     * cache.
86     */
87    private IdLock offsetLock = new IdLock();
88  
89    /**
90     * Blocks read from the load-on-open section, excluding data root index, meta
91     * index, and file info.
92     */
93    private List<HFileBlock> loadOnOpenBlocks = new ArrayList<HFileBlock>();
94  
95    /** Minimum minor version supported by this HFile format */
96    static final int MIN_MINOR_VERSION = 0;
97  
98    /** Maximum minor version supported by this HFile format */
99    // We went to version 2 when we moved to pb'ing fileinfo and the trailer on
100   // the file. This version can read Writables version 1.
101   static final int MAX_MINOR_VERSION = 3;
102 
103   /** Minor versions starting with this number have faked index key */
104   static final int MINOR_VERSION_WITH_FAKED_KEY = 3;
105 
106   protected HFileContext hfileContext;
107 
108   /**
109    * Opens a HFile. You must load the index before you can use it by calling
110    * {@link #loadFileInfo()}.
111    *
112    * @param path Path to HFile.
113    * @param trailer File trailer.
114    * @param fsdis input stream.
115    * @param size Length of the stream.
116    * @param cacheConf Cache configuration.
117    * @param hfs
118    * @param conf
119    */
120   public HFileReaderV2(final Path path, final FixedFileTrailer trailer,
121       final FSDataInputStreamWrapper fsdis, final long size, final CacheConfig cacheConf,
122       final HFileSystem hfs, final Configuration conf) throws IOException {
123     super(path, trailer, size, cacheConf, hfs, conf);
124     this.conf = conf;
125     trailer.expectMajorVersion(getMajorVersion());
126     validateMinorVersion(path, trailer.getMinorVersion());
127     this.hfileContext = createHFileContext(fsdis, fileSize, hfs, path, trailer);
128     HFileBlock.FSReaderV2 fsBlockReaderV2 = new HFileBlock.FSReaderV2(fsdis, fileSize, hfs, path,
129         hfileContext);
130     this.fsBlockReader = fsBlockReaderV2; // upcast
131 
132     // Comparator class name is stored in the trailer in version 2.
133     comparator = trailer.createComparator();
134     dataBlockIndexReader = new HFileBlockIndex.BlockIndexReader(comparator,
135         trailer.getNumDataIndexLevels(), this);
136     metaBlockIndexReader = new HFileBlockIndex.BlockIndexReader(
137         KeyValue.RAW_COMPARATOR, 1);
138 
139     // Parse load-on-open data.
140 
141     HFileBlock.BlockIterator blockIter = fsBlockReaderV2.blockRange(
142         trailer.getLoadOnOpenDataOffset(),
143         fileSize - trailer.getTrailerSize());
144 
145     // Data index. We also read statistics about the block index written after
146     // the root level.
147     dataBlockIndexReader.readMultiLevelIndexRoot(
148         blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
149         trailer.getDataIndexCount());
150 
151     // Meta index.
152     metaBlockIndexReader.readRootIndex(
153         blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
154         trailer.getMetaIndexCount());
155 
156     // File info
157     fileInfo = new FileInfo();
158     fileInfo.read(blockIter.nextBlockWithBlockType(BlockType.FILE_INFO).getByteStream());
159     lastKey = fileInfo.get(FileInfo.LASTKEY);
160     avgKeyLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_KEY_LEN));
161     avgValueLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_VALUE_LEN));
162     byte [] keyValueFormatVersion =
163         fileInfo.get(HFileWriterV2.KEY_VALUE_VERSION);
164     includesMemstoreTS = keyValueFormatVersion != null &&
165         Bytes.toInt(keyValueFormatVersion) ==
166             HFileWriterV2.KEY_VALUE_VER_WITH_MEMSTORE;
167     fsBlockReaderV2.setIncludesMemstoreTS(includesMemstoreTS);
168     if (includesMemstoreTS) {
169       decodeMemstoreTS = Bytes.toLong(fileInfo.get(HFileWriterV2.MAX_MEMSTORE_TS_KEY)) > 0;
170     }
171 
172     // Read data block encoding algorithm name from file info.
173     dataBlockEncoder = HFileDataBlockEncoderImpl.createFromFileInfo(fileInfo);
174     fsBlockReaderV2.setDataBlockEncoder(dataBlockEncoder);
175 
176     // Store all other load-on-open blocks for further consumption.
177     HFileBlock b;
178     while ((b = blockIter.nextBlock()) != null) {
179       loadOnOpenBlocks.add(b);
180     }
181 
182     // Prefetch file blocks upon open if requested
183     if (cacheConf.shouldPrefetchOnOpen()) {
184       PrefetchExecutor.request(path, new Runnable() {
185         public void run() {
186           long offset = 0;
187           long end = 0;
188           try {
189             end = getTrailer().getLoadOnOpenDataOffset();
190             HFileBlock prevBlock = null;
191             if (LOG.isTraceEnabled()) {
192               LOG.trace("File=" + path.toString() + ", offset=" + offset + ", end=" + end);
193             }
194             while (offset < end) {
195               if (Thread.interrupted()) {
196                 break;
197               }
198               long onDiskSize = -1;
199               if (prevBlock != null) {
200                 onDiskSize = prevBlock.getNextBlockOnDiskSizeWithHeader();
201               }
202               HFileBlock block = readBlock(offset, onDiskSize, true, false, false, false, null);
203               prevBlock = block;
204               offset += block.getOnDiskSizeWithHeader();
205             }
206           } catch (IOException e) {
207             // IOExceptions are probably due to region closes (relocation, etc.)
208             if (LOG.isTraceEnabled()) {
209               LOG.trace("File=" + path.toString() + ", offset=" + offset + ", end=" + end, e);
210             }
211           } catch (Exception e) {
212             // Other exceptions are interesting
213             LOG.warn("File=" + path.toString() + ", offset=" + offset + ", end=" + end, e);
214           } finally {
215             PrefetchExecutor.complete(path);
216           }
217         }
218       });
219     }
220   }
221 
222   protected HFileContext createHFileContext(FSDataInputStreamWrapper fsdis, long fileSize,
223       HFileSystem hfs, Path path, FixedFileTrailer trailer) throws IOException {
224     return new HFileContextBuilder()
225       .withIncludesMvcc(this.includesMemstoreTS)
226       .withCompression(this.compressAlgo)
227       .withHBaseCheckSum(trailer.getMinorVersion() >= MINOR_VERSION_WITH_CHECKSUM)
228       .build();
229   }
230 
231   /**
232    * Create a Scanner on this file. No seeks or reads are done on creation. Call
233    * {@link HFileScanner#seekTo(byte[])} to position an start the read. There is
234    * nothing to clean up in a Scanner. Letting go of your references to the
235    * scanner is sufficient.
236    *
237    * @param cacheBlocks True if we should cache blocks read in by this scanner.
238    * @param pread Use positional read rather than seek+read if true (pread is
239    *          better for random reads, seek+read is better scanning).
240    * @param isCompaction is scanner being used for a compaction?
241    * @return Scanner on this file.
242    */
243    @Override
244    public HFileScanner getScanner(boolean cacheBlocks, final boolean pread,
245       final boolean isCompaction) {
246     if (dataBlockEncoder.useEncodedScanner()) {
247       return new EncodedScannerV2(this, cacheBlocks, pread, isCompaction,
248           hfileContext);
249     }
250 
251     return new ScannerV2(this, cacheBlocks, pread, isCompaction);
252   }
253 
254   /**
255    * @param metaBlockName
256    * @param cacheBlock Add block to cache, if found
257    * @return block wrapped in a ByteBuffer, with header skipped
258    * @throws IOException
259    */
260   @Override
261   public ByteBuffer getMetaBlock(String metaBlockName, boolean cacheBlock)
262       throws IOException {
263     if (trailer.getMetaIndexCount() == 0) {
264       return null; // there are no meta blocks
265     }
266     if (metaBlockIndexReader == null) {
267       throw new IOException("Meta index not loaded");
268     }
269 
270     byte[] mbname = Bytes.toBytes(metaBlockName);
271     int block = metaBlockIndexReader.rootBlockContainingKey(mbname, 0,
272         mbname.length);
273     if (block == -1)
274       return null;
275     long blockSize = metaBlockIndexReader.getRootBlockDataSize(block);
276 
277     // Per meta key from any given file, synchronize reads for said block. This
278     // is OK to do for meta blocks because the meta block index is always
279     // single-level.
280     synchronized (metaBlockIndexReader.getRootBlockKey(block)) {
281       // Check cache for block. If found return.
282       long metaBlockOffset = metaBlockIndexReader.getRootBlockOffset(block);
283       BlockCacheKey cacheKey = new BlockCacheKey(name, metaBlockOffset,
284           DataBlockEncoding.NONE, BlockType.META);
285 
286       cacheBlock &= cacheConf.shouldCacheDataOnRead();
287       if (cacheConf.isBlockCacheEnabled()) {
288         HFileBlock cachedBlock =
289           (HFileBlock) cacheConf.getBlockCache().getBlock(cacheKey, cacheBlock, false, true);
290         if (cachedBlock != null) {
291           assert cachedBlock.isUnpacked() : "Packed block leak.";
292           // Return a distinct 'shallow copy' of the block,
293           // so pos does not get messed by the scanner
294           return cachedBlock.getBufferWithoutHeader();
295         }
296         // Cache Miss, please load.
297       }
298 
299       HFileBlock metaBlock = fsBlockReader.readBlockData(metaBlockOffset,
300           blockSize, -1, true).unpack(hfileContext, fsBlockReader);
301 
302       // Cache the block
303       if (cacheBlock) {
304         cacheConf.getBlockCache().cacheBlock(cacheKey, metaBlock,
305             cacheConf.isInMemory());
306       }
307 
308       return metaBlock.getBufferWithoutHeader();
309     }
310   }
311 
312   /**
313    * Read in a file block.
314    * @param dataBlockOffset offset to read.
315    * @param onDiskBlockSize size of the block
316    * @param cacheBlock
317    * @param pread Use positional read instead of seek+read (positional is
318    *          better doing random reads whereas seek+read is better scanning).
319    * @param isCompaction is this block being read as part of a compaction
320    * @param expectedBlockType the block type we are expecting to read with this
321    *          read operation, or null to read whatever block type is available
322    *          and avoid checking (that might reduce caching efficiency of
323    *          encoded data blocks)
324    * @return Block wrapped in a ByteBuffer.
325    * @throws IOException
326    */
327   @Override
328   public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize,
329       final boolean cacheBlock, boolean pread, final boolean isCompaction,
330       final boolean updateCacheMetrics, BlockType expectedBlockType)
331       throws IOException {
332     if (dataBlockIndexReader == null) {
333       throw new IOException("Block index not loaded");
334     }
335     long trailerOffset = trailer.getLoadOnOpenDataOffset();
336     if (dataBlockOffset < 0 || dataBlockOffset >= trailerOffset) {
337       throw new IOException("Requested block is out of range: " + dataBlockOffset +
338         ", lastDataBlockOffset: " + trailer.getLastDataBlockOffset() +
339         ", trailer.getLoadOnOpenDataOffset: " + trailerOffset);
340     }
341     // For any given block from any given file, synchronize reads for said
342     // block.
343     // Without a cache, this synchronizing is needless overhead, but really
344     // the other choice is to duplicate work (which the cache would prevent you
345     // from doing).
346 
347     BlockCacheKey cacheKey =
348         new BlockCacheKey(name, dataBlockOffset,
349             dataBlockEncoder.getDataBlockEncoding(),
350             expectedBlockType);
351 
352     boolean useLock = false;
353     IdLock.Entry lockEntry = null;
354     TraceScope traceScope = Trace.startSpan("HFileReaderV2.readBlock");
355     try {
356       while (true) {
357         // Check cache for block. If found return.
358         if (cacheConf.shouldReadBlockFromCache(expectedBlockType)) {
359           if (useLock) {
360             lockEntry = offsetLock.getLockEntry(dataBlockOffset);
361           }
362           // Try and get the block from the block cache. If the useLock variable is true then this
363           // is the second time through the loop and it should not be counted as a block cache miss.
364           HFileBlock cachedBlock = (HFileBlock) cacheConf.getBlockCache().getBlock(cacheKey, 
365             cacheBlock, useLock, updateCacheMetrics);
366           if (cachedBlock != null) {
367             if (cacheConf.shouldCacheCompressed(cachedBlock.getBlockType().getCategory())) {
368               cachedBlock = cachedBlock.unpack(hfileContext, fsBlockReader);
369             }
370             if (Trace.isTracing()) {
371               traceScope.getSpan().addTimelineAnnotation("blockCacheHit");
372             }
373             assert cachedBlock.isUnpacked() : "Packed block leak.";
374             if (cachedBlock.getBlockType().isData()) {
375               HFile.dataBlockReadCnt.incrementAndGet();
376 
377               // Validate encoding type for data blocks. We include encoding
378               // type in the cache key, and we expect it to match on a cache hit.
379               if (cachedBlock.getDataBlockEncoding() != dataBlockEncoder.getDataBlockEncoding()) {
380                 throw new IOException("Cached block under key " + cacheKey + " "
381                   + "has wrong encoding: " + cachedBlock.getDataBlockEncoding() + " (expected: "
382                   + dataBlockEncoder.getDataBlockEncoding() + ")");
383               }
384             }
385             return cachedBlock;
386           }
387           if (!useLock && cacheBlock && cacheConf.shouldLockOnCacheMiss(expectedBlockType)) {
388             // check cache again with lock
389             useLock = true;
390             continue;
391           }
392           // Carry on, please load.
393         }
394 
395         if (Trace.isTracing()) {
396           traceScope.getSpan().addTimelineAnnotation("blockCacheMiss");
397         }
398         // Load block from filesystem.
399         HFileBlock hfileBlock = fsBlockReader.readBlockData(dataBlockOffset, onDiskBlockSize, -1,
400             pread);
401         validateBlockType(hfileBlock, expectedBlockType);
402         HFileBlock unpacked = hfileBlock.unpack(hfileContext, fsBlockReader);
403         BlockType.BlockCategory category = hfileBlock.getBlockType().getCategory();
404 
405         // Cache the block if necessary
406         if (cacheBlock && cacheConf.shouldCacheBlockOnRead(category)) {
407           cacheConf.getBlockCache().cacheBlock(cacheKey,
408             cacheConf.shouldCacheCompressed(category) ? hfileBlock : unpacked,
409             cacheConf.isInMemory());
410         }
411 
412         if (updateCacheMetrics && hfileBlock.getBlockType().isData()) {
413           HFile.dataBlockReadCnt.incrementAndGet();
414         }
415 
416         return unpacked;
417       }
418     } finally {
419       traceScope.close();
420       if (lockEntry != null) {
421         offsetLock.releaseLockEntry(lockEntry);
422       }
423     }
424   }
425 
426   @Override
427   public boolean hasMVCCInfo() {
428     return includesMemstoreTS && decodeMemstoreTS;
429   }
430 
431   /**
432    * Compares the actual type of a block retrieved from cache or disk with its
433    * expected type and throws an exception in case of a mismatch. Expected
434    * block type of {@link BlockType#DATA} is considered to match the actual
435    * block type [@link {@link BlockType#ENCODED_DATA} as well.
436    * @param block a block retrieved from cache or disk
437    * @param expectedBlockType the expected block type, or null to skip the
438    *          check
439    */
440   private void validateBlockType(HFileBlock block,
441       BlockType expectedBlockType) throws IOException {
442     if (expectedBlockType == null) {
443       return;
444     }
445     BlockType actualBlockType = block.getBlockType();
446     if (actualBlockType == BlockType.ENCODED_DATA &&
447         expectedBlockType == BlockType.DATA) {
448       // We consider DATA to match ENCODED_DATA for the purpose of this
449       // verification.
450       return;
451     }
452     if (actualBlockType != expectedBlockType) {
453       throw new IOException("Expected block type " + expectedBlockType + ", " +
454           "but got " + actualBlockType + ": " + block);
455     }
456   }
457 
458   /**
459    * @return Last key in the file. May be null if file has no entries. Note that
460    *         this is not the last row key, but rather the byte form of the last
461    *         KeyValue.
462    */
463   @Override
464   public byte[] getLastKey() {
465     return dataBlockIndexReader.isEmpty() ? null : lastKey;
466   }
467 
468   /**
469    * @return Midkey for this file. We work with block boundaries only so
470    *         returned midkey is an approximation only.
471    * @throws IOException
472    */
473   @Override
474   public byte[] midkey() throws IOException {
475     return dataBlockIndexReader.midkey();
476   }
477 
478   @Override
479   public void close() throws IOException {
480     close(cacheConf.shouldEvictOnClose());
481   }
482 
483   public void close(boolean evictOnClose) throws IOException {
484     PrefetchExecutor.cancel(path);
485     if (evictOnClose && cacheConf.isBlockCacheEnabled()) {
486       int numEvicted = cacheConf.getBlockCache().evictBlocksByHfileName(name);
487       if (LOG.isTraceEnabled()) {
488         LOG.trace("On close, file=" + name + " evicted=" + numEvicted
489           + " block(s)");
490       }
491     }
492     fsBlockReader.closeStreams();
493   }
494 
495   /** For testing */
496   @Override
497   HFileBlock.FSReader getUncachedBlockReader() {
498     return fsBlockReader;
499   }
500 
501 
502   protected abstract static class AbstractScannerV2
503       extends AbstractHFileReader.Scanner {
504     protected HFileBlock block;
505 
506     @Override
507     public byte[] getNextIndexedKey() {
508       return nextIndexedKey;
509     }
510     /**
511      * The next indexed key is to keep track of the indexed key of the next data block.
512      * If the nextIndexedKey is HConstants.NO_NEXT_INDEXED_KEY, it means that the
513      * current data block is the last data block.
514      *
515      * If the nextIndexedKey is null, it means the nextIndexedKey has not been loaded yet.
516      */
517     protected byte[] nextIndexedKey;
518 
519     public AbstractScannerV2(HFileReaderV2 r, boolean cacheBlocks,
520         final boolean pread, final boolean isCompaction) {
521       super(r, cacheBlocks, pread, isCompaction);
522     }
523 
524     /**
525      * An internal API function. Seek to the given key, optionally rewinding to
526      * the first key of the block before doing the seek.
527      *
528      * @param key key byte array
529      * @param offset key offset in the key byte array
530      * @param length key length
531      * @param rewind whether to rewind to the first key of the block before
532      *        doing the seek. If this is false, we are assuming we never go
533      *        back, otherwise the result is undefined.
534      * @return -1 if the key is earlier than the first key of the file,
535      *         0 if we are at the given key, 1 if we are past the given key
536      *         -2 if the key is earlier than the first key of the file while
537      *         using a faked index key
538      * @throws IOException
539      */
540     protected int seekTo(byte[] key, int offset, int length, boolean rewind)
541         throws IOException {
542       HFileBlockIndex.BlockIndexReader indexReader =
543           reader.getDataBlockIndexReader();
544       BlockWithScanInfo blockWithScanInfo =
545         indexReader.loadDataBlockWithScanInfo(key, offset, length, block,
546             cacheBlocks, pread, isCompaction);
547       if (blockWithScanInfo == null || blockWithScanInfo.getHFileBlock() == null) {
548         // This happens if the key e.g. falls before the beginning of the file.
549         return -1;
550       }
551       return loadBlockAndSeekToKey(blockWithScanInfo.getHFileBlock(),
552           blockWithScanInfo.getNextIndexedKey(), rewind, key, offset, length, false);
553     }
554 
555     protected abstract ByteBuffer getFirstKeyInBlock(HFileBlock curBlock);
556 
557     protected abstract int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
558         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
559         throws IOException;
560 
561     @Override
562     public int seekTo(byte[] key, int offset, int length) throws IOException {
563       // Always rewind to the first key of the block, because the given key
564       // might be before or after the current key.
565       return seekTo(key, offset, length, true);
566     }
567 
568     @Override
569     public int reseekTo(byte[] key, int offset, int length) throws IOException {
570       int compared;
571       if (isSeeked()) {
572         compared = compareKey(reader.getComparator(), key, offset, length);
573         if (compared < 1) {
574           // If the required key is less than or equal to current key, then
575           // don't do anything.
576           return compared;
577         } else {
578           if (this.nextIndexedKey != null &&
579               (this.nextIndexedKey == HConstants.NO_NEXT_INDEXED_KEY ||
580                reader.getComparator().compareFlatKey(key, offset, length,
581                    nextIndexedKey, 0, nextIndexedKey.length) < 0)) {
582             // The reader shall continue to scan the current data block instead of querying the
583             // block index as long as it knows the target key is strictly smaller than
584             // the next indexed key or the current data block is the last data block.
585             return loadBlockAndSeekToKey(this.block, this.nextIndexedKey,
586                 false, key, offset, length, false);
587           }
588         }
589       }
590       // Don't rewind on a reseek operation, because reseek implies that we are
591       // always going forward in the file.
592       return seekTo(key, offset, length, false);
593     }
594 
595     @Override
596     public boolean seekBefore(byte[] key, int offset, int length)
597         throws IOException {
598       HFileBlock seekToBlock =
599           reader.getDataBlockIndexReader().seekToDataBlock(key, offset, length,
600               block, cacheBlocks, pread, isCompaction);
601       if (seekToBlock == null) {
602         return false;
603       }
604       ByteBuffer firstKey = getFirstKeyInBlock(seekToBlock);
605 
606       if (reader.getComparator().compareFlatKey(firstKey.array(),
607           firstKey.arrayOffset(), firstKey.limit(), key, offset, length) >= 0)
608       {
609         long previousBlockOffset = seekToBlock.getPrevBlockOffset();
610         // The key we are interested in
611         if (previousBlockOffset == -1) {
612           // we have a 'problem', the key we want is the first of the file.
613           return false;
614         }
615 
616         // It is important that we compute and pass onDiskSize to the block
617         // reader so that it does not have to read the header separately to
618         // figure out the size.  Currently, we do not have a way to do this
619         // correctly in the general case however.
620         // TODO: See https://issues.apache.org/jira/browse/HBASE-14576
621         int prevBlockSize = -1;
622         seekToBlock = reader.readBlock(previousBlockOffset,
623             prevBlockSize, cacheBlocks,
624             pread, isCompaction, true, BlockType.DATA);
625         // TODO shortcut: seek forward in this block to the last key of the
626         // block.
627       }
628       byte[] firstKeyInCurrentBlock = Bytes.getBytes(firstKey);
629       loadBlockAndSeekToKey(seekToBlock, firstKeyInCurrentBlock, true, key, offset, length, true);
630       return true;
631     }
632 
633 
634     /**
635      * Scans blocks in the "scanned" section of the {@link HFile} until the next
636      * data block is found.
637      *
638      * @return the next block, or null if there are no more data blocks
639      * @throws IOException
640      */
641     protected HFileBlock readNextDataBlock() throws IOException {
642       long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
643       if (block == null)
644         return null;
645 
646       HFileBlock curBlock = block;
647 
648       do {
649         if (curBlock.getOffset() >= lastDataBlockOffset)
650           return null;
651 
652         if (curBlock.getOffset() < 0) {
653           throw new IOException("Invalid block file offset: " + block);
654         }
655 
656         // We are reading the next block without block type validation, because
657         // it might turn out to be a non-data block.
658         curBlock = reader.readBlock(curBlock.getOffset()
659             + curBlock.getOnDiskSizeWithHeader(),
660             curBlock.getNextBlockOnDiskSizeWithHeader(), cacheBlocks, pread,
661             isCompaction, true, null);
662       } while (!curBlock.getBlockType().isData());
663 
664       return curBlock;
665     }
666     /**
667      * Compare the given key against the current key
668      * @param comparator
669      * @param key
670      * @param offset
671      * @param length
672      * @return -1 is the passed key is smaller than the current key, 0 if equal and 1 if greater
673      */
674     public abstract int compareKey(KVComparator comparator, byte[] key, int offset,
675         int length);
676   }
677 
678   /**
679    * Implementation of {@link HFileScanner} interface.
680    */
681   protected static class ScannerV2 extends AbstractScannerV2 {
682     private HFileReaderV2 reader;
683 
684     public ScannerV2(HFileReaderV2 r, boolean cacheBlocks,
685         final boolean pread, final boolean isCompaction) {
686       super(r, cacheBlocks, pread, isCompaction);
687       this.reader = r;
688     }
689 
690     @Override
691     public KeyValue getKeyValue() {
692       if (!isSeeked())
693         return null;
694 
695       // HFile V2 do not support tags.
696       return formNoTagsKeyValue();
697     }
698 
699     protected KeyValue formNoTagsKeyValue() {
700       KeyValue ret = new NoTagsKeyValue(blockBuffer.array(), blockBuffer.arrayOffset()
701           + blockBuffer.position(), getCellBufSize());
702       if (this.reader.shouldIncludeMemstoreTS()) {
703         ret.setMvccVersion(currMemstoreTS);
704       }
705       return ret;
706     }
707 
708     protected int getCellBufSize() {
709       return KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen;
710     }
711 
712     @Override
713     public ByteBuffer getKey() {
714       assertSeeked();
715       return ByteBuffer.wrap(
716           blockBuffer.array(),
717           blockBuffer.arrayOffset() + blockBuffer.position()
718               + KEY_VALUE_LEN_SIZE, currKeyLen).slice();
719     }
720 
721     @Override
722     public int compareKey(KVComparator comparator, byte[] key, int offset, int length) {
723       return comparator.compareFlatKey(key, offset, length, blockBuffer.array(),
724           blockBuffer.arrayOffset() + blockBuffer.position() + KEY_VALUE_LEN_SIZE, currKeyLen);
725     }
726 
727     @Override
728     public ByteBuffer getValue() {
729       assertSeeked();
730       return ByteBuffer.wrap(
731           blockBuffer.array(),
732           blockBuffer.arrayOffset() + blockBuffer.position()
733               + KEY_VALUE_LEN_SIZE + currKeyLen, currValueLen).slice();
734     }
735 
736     protected void setNonSeekedState() {
737       block = null;
738       blockBuffer = null;
739       currKeyLen = 0;
740       currValueLen = 0;
741       currMemstoreTS = 0;
742       currMemstoreTSLen = 0;
743     }
744 
745     /**
746      * Go to the next key/value in the block section. Loads the next block if
747      * necessary. If successful, {@link #getKey()} and {@link #getValue()} can
748      * be called.
749      *
750      * @return true if successfully navigated to the next key/value
751      */
752     @Override
753     public boolean next() throws IOException {
754       assertSeeked();
755 
756       try {
757         blockBuffer.position(getNextCellStartPosition());
758       } catch (IllegalArgumentException e) {
759         LOG.error("Current pos = " + blockBuffer.position()
760             + "; currKeyLen = " + currKeyLen + "; currValLen = "
761             + currValueLen + "; block limit = " + blockBuffer.limit()
762             + "; HFile name = " + reader.getName()
763             + "; currBlock currBlockOffset = " + block.getOffset());
764         throw e;
765       }
766 
767       if (blockBuffer.remaining() <= 0) {
768         long lastDataBlockOffset =
769             reader.getTrailer().getLastDataBlockOffset();
770 
771         if (block.getOffset() >= lastDataBlockOffset) {
772           setNonSeekedState();
773           return false;
774         }
775 
776         // read the next block
777         HFileBlock nextBlock = readNextDataBlock();
778         if (nextBlock == null) {
779           setNonSeekedState();
780           return false;
781         }
782 
783         updateCurrBlock(nextBlock);
784         return true;
785       }
786 
787       // We are still in the same block.
788       readKeyValueLen();
789       return true;
790     }
791 
792     protected int getNextCellStartPosition() {
793       return blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen
794           + currMemstoreTSLen;
795     }
796 
797     /**
798      * Positions this scanner at the start of the file.
799      *
800      * @return false if empty file; i.e. a call to next would return false and
801      *         the current key and value are undefined.
802      * @throws IOException
803      */
804     @Override
805     public boolean seekTo() throws IOException {
806       if (reader == null) {
807         return false;
808       }
809 
810       if (reader.getTrailer().getEntryCount() == 0) {
811         // No data blocks.
812         return false;
813       }
814 
815       long firstDataBlockOffset =
816           reader.getTrailer().getFirstDataBlockOffset();
817       if (block != null && block.getOffset() == firstDataBlockOffset) {
818         blockBuffer.rewind();
819         readKeyValueLen();
820         return true;
821       }
822 
823       block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
824           isCompaction, true, BlockType.DATA);
825       if (block.getOffset() < 0) {
826         throw new IOException("Invalid block offset: " + block.getOffset());
827       }
828       updateCurrBlock(block);
829       return true;
830     }
831 
832     @Override
833     protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
834         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
835         throws IOException {
836       if (block == null || block.getOffset() != seekToBlock.getOffset()) {
837         updateCurrBlock(seekToBlock);
838       } else if (rewind) {
839         blockBuffer.rewind();
840       }
841 
842       // Update the nextIndexedKey
843       this.nextIndexedKey = nextIndexedKey;
844       return blockSeek(key, offset, length, seekBefore);
845     }
846 
847     /**
848      * Updates the current block to be the given {@link HFileBlock}. Seeks to
849      * the the first key/value pair.
850      *
851      * @param newBlock the block to make current
852      */
853     protected void updateCurrBlock(HFileBlock newBlock) {
854       block = newBlock;
855 
856       // sanity check
857       if (block.getBlockType() != BlockType.DATA) {
858         throw new IllegalStateException("ScannerV2 works only on data " +
859             "blocks, got " + block.getBlockType() + "; " +
860             "fileName=" + reader.name + ", " +
861             "dataBlockEncoder=" + reader.dataBlockEncoder + ", " +
862             "isCompaction=" + isCompaction);
863       }
864 
865       blockBuffer = block.getBufferWithoutHeader();
866       readKeyValueLen();
867       blockFetches++;
868 
869       // Reset the next indexed key
870       this.nextIndexedKey = null;
871     }
872 
873     protected void readKeyValueLen() {
874       blockBuffer.mark();
875       currKeyLen = blockBuffer.getInt();
876       currValueLen = blockBuffer.getInt();
877       ByteBufferUtils.skip(blockBuffer, currKeyLen + currValueLen);
878       readMvccVersion();
879       if (currKeyLen < 0 || currValueLen < 0
880           || currKeyLen > blockBuffer.limit()
881           || currValueLen > blockBuffer.limit()) {
882         throw new IllegalStateException("Invalid currKeyLen " + currKeyLen
883             + " or currValueLen " + currValueLen + ". Block offset: "
884             + block.getOffset() + ", block length: " + blockBuffer.limit()
885             + ", position: " + blockBuffer.position() + " (without header).");
886       }
887       blockBuffer.reset();
888     }
889 
890     protected void readMvccVersion() {
891       if (this.reader.shouldIncludeMemstoreTS()) {
892         if (this.reader.decodeMemstoreTS) {
893           currMemstoreTS = Bytes.readAsVLong(blockBuffer.array(), blockBuffer.arrayOffset()
894               + blockBuffer.position());
895           currMemstoreTSLen = WritableUtils.getVIntSize(currMemstoreTS);
896         } else {
897           currMemstoreTS = 0;
898           currMemstoreTSLen = 1;
899         }
900       }
901     }
902 
903     /**
904      * Within a loaded block, seek looking for the last key that is smaller
905      * than (or equal to?) the key we are interested in.
906      *
907      * A note on the seekBefore: if you have seekBefore = true, AND the first
908      * key in the block = key, then you'll get thrown exceptions. The caller has
909      * to check for that case and load the previous block as appropriate.
910      *
911      * @param key the key to find
912      * @param seekBefore find the key before the given key in case of exact
913      *          match.
914      * @return 0 in case of an exact key match, 1 in case of an inexact match,
915      *         -2 in case of an inexact match and furthermore, the input key less
916      *         than the first key of current block(e.g. using a faked index key)
917      */
918     protected int blockSeek(byte[] key, int offset, int length,
919         boolean seekBefore) {
920       int klen, vlen;
921       long memstoreTS = 0;
922       int memstoreTSLen = 0;
923       int lastKeyValueSize = -1;
924       do {
925         blockBuffer.mark();
926         klen = blockBuffer.getInt();
927         vlen = blockBuffer.getInt();
928         blockBuffer.reset();
929         if (this.reader.shouldIncludeMemstoreTS()) {
930           if (this.reader.decodeMemstoreTS) {
931             int memstoreTSOffset = blockBuffer.arrayOffset() + blockBuffer.position()
932                 + KEY_VALUE_LEN_SIZE + klen + vlen;
933             memstoreTS = Bytes.readAsVLong(blockBuffer.array(), memstoreTSOffset);
934             memstoreTSLen = WritableUtils.getVIntSize(memstoreTS);
935           } else {
936             memstoreTS = 0;
937             memstoreTSLen = 1;
938           }
939         }
940 
941         int keyOffset = blockBuffer.arrayOffset() + blockBuffer.position()
942             + KEY_VALUE_LEN_SIZE;
943         int comp = reader.getComparator().compareFlatKey(key, offset, length,
944             blockBuffer.array(), keyOffset, klen);
945 
946         if (comp == 0) {
947           if (seekBefore) {
948             if (lastKeyValueSize < 0) {
949               throw new IllegalStateException("blockSeek with seekBefore "
950                   + "at the first key of the block: key="
951                   + Bytes.toStringBinary(key) + ", blockOffset="
952                   + block.getOffset() + ", onDiskSize="
953                   + block.getOnDiskSizeWithHeader());
954             }
955             blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
956             readKeyValueLen();
957             return 1; // non exact match.
958           }
959           currKeyLen = klen;
960           currValueLen = vlen;
961           if (this.reader.shouldIncludeMemstoreTS()) {
962             currMemstoreTS = memstoreTS;
963             currMemstoreTSLen = memstoreTSLen;
964           }
965           return 0; // indicate exact match
966         } else if (comp < 0) {
967           if (lastKeyValueSize > 0)
968             blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
969           readKeyValueLen();
970           if (lastKeyValueSize == -1 && blockBuffer.position() == 0
971               && this.reader.trailer.getMinorVersion() >= MINOR_VERSION_WITH_FAKED_KEY) {
972             return HConstants.INDEX_KEY_MAGIC;
973           }
974           return 1;
975         }
976 
977         // The size of this key/value tuple, including key/value length fields.
978         lastKeyValueSize = klen + vlen + memstoreTSLen + KEY_VALUE_LEN_SIZE;
979         blockBuffer.position(blockBuffer.position() + lastKeyValueSize);
980       } while (blockBuffer.remaining() > 0);
981 
982       // Seek to the last key we successfully read. This will happen if this is
983       // the last key/value pair in the file, in which case the following call
984       // to next() has to return false.
985       blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
986       readKeyValueLen();
987       return 1; // didn't exactly find it.
988     }
989 
990     @Override
991     protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
992       ByteBuffer buffer = curBlock.getBufferWithoutHeader();
993       // It is safe to manipulate this buffer because we own the buffer object.
994       buffer.rewind();
995       int klen = buffer.getInt();
996       buffer.getInt();
997       ByteBuffer keyBuff = buffer.slice();
998       keyBuff.limit(klen);
999       keyBuff.rewind();
1000       return keyBuff;
1001     }
1002 
1003     @Override
1004     public String getKeyString() {
1005       return Bytes.toStringBinary(blockBuffer.array(),
1006           blockBuffer.arrayOffset() + blockBuffer.position()
1007               + KEY_VALUE_LEN_SIZE, currKeyLen);
1008     }
1009 
1010     @Override
1011     public String getValueString() {
1012       return Bytes.toString(blockBuffer.array(), blockBuffer.arrayOffset()
1013           + blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen,
1014           currValueLen);
1015     }
1016   }
1017 
1018   /**
1019    * ScannerV2 that operates on encoded data blocks.
1020    */
1021   protected static class EncodedScannerV2 extends AbstractScannerV2 {
1022     private final HFileBlockDecodingContext decodingCtx;
1023     private final DataBlockEncoder.EncodedSeeker seeker;
1024     private final DataBlockEncoder dataBlockEncoder;
1025     protected final HFileContext meta;
1026 
1027     public EncodedScannerV2(HFileReaderV2 reader, boolean cacheBlocks,
1028         boolean pread, boolean isCompaction, HFileContext meta) {
1029       super(reader, cacheBlocks, pread, isCompaction);
1030       DataBlockEncoding encoding = reader.dataBlockEncoder.getDataBlockEncoding();
1031       dataBlockEncoder = encoding.getEncoder();
1032       decodingCtx = dataBlockEncoder.newDataBlockDecodingContext(meta);
1033       seeker = dataBlockEncoder.createSeeker(
1034         reader.getComparator(), decodingCtx);
1035       this.meta = meta;
1036     }
1037 
1038     @Override
1039     public boolean isSeeked(){
1040       return this.block != null;
1041     }
1042 
1043     /**
1044      * Updates the current block to be the given {@link HFileBlock}. Seeks to
1045      * the the first key/value pair.
1046      *
1047      * @param newBlock the block to make current
1048      * @throws CorruptHFileException
1049      */
1050     private void updateCurrentBlock(HFileBlock newBlock) throws CorruptHFileException {
1051       block = newBlock;
1052 
1053       // sanity checks
1054       if (block.getBlockType() != BlockType.ENCODED_DATA) {
1055         throw new IllegalStateException(
1056             "EncodedScanner works only on encoded data blocks");
1057       }
1058       short dataBlockEncoderId = block.getDataBlockEncodingId();
1059       if (!DataBlockEncoding.isCorrectEncoder(dataBlockEncoder, dataBlockEncoderId)) {
1060         String encoderCls = dataBlockEncoder.getClass().getName();
1061         throw new CorruptHFileException("Encoder " + encoderCls
1062           + " doesn't support data block encoding "
1063           + DataBlockEncoding.getNameFromId(dataBlockEncoderId));
1064       }
1065 
1066       seeker.setCurrentBuffer(getEncodedBuffer(newBlock));
1067       blockFetches++;
1068 
1069       // Reset the next indexed key
1070       this.nextIndexedKey = null;
1071     }
1072 
1073     private ByteBuffer getEncodedBuffer(HFileBlock newBlock) {
1074       ByteBuffer origBlock = newBlock.getBufferReadOnly();
1075       ByteBuffer encodedBlock = ByteBuffer.wrap(origBlock.array(),
1076           origBlock.arrayOffset() + newBlock.headerSize() +
1077           DataBlockEncoding.ID_SIZE,
1078           newBlock.getUncompressedSizeWithoutHeader() -
1079           DataBlockEncoding.ID_SIZE).slice();
1080       return encodedBlock;
1081     }
1082 
1083     @Override
1084     public boolean seekTo() throws IOException {
1085       if (reader == null) {
1086         return false;
1087       }
1088 
1089       if (reader.getTrailer().getEntryCount() == 0) {
1090         // No data blocks.
1091         return false;
1092       }
1093 
1094       long firstDataBlockOffset =
1095           reader.getTrailer().getFirstDataBlockOffset();
1096       if (block != null && block.getOffset() == firstDataBlockOffset) {
1097         seeker.rewind();
1098         return true;
1099       }
1100 
1101       block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
1102           isCompaction, true, BlockType.DATA);
1103       if (block.getOffset() < 0) {
1104         throw new IOException("Invalid block offset: " + block.getOffset());
1105       }
1106       updateCurrentBlock(block);
1107       return true;
1108     }
1109 
1110     @Override
1111     public boolean next() throws IOException {
1112       boolean isValid = seeker.next();
1113       if (!isValid) {
1114         block = readNextDataBlock();
1115         isValid = block != null;
1116         if (isValid) {
1117           updateCurrentBlock(block);
1118         }
1119       }
1120       return isValid;
1121     }
1122 
1123     @Override
1124     public ByteBuffer getKey() {
1125       assertValidSeek();
1126       return seeker.getKeyDeepCopy();
1127     }
1128 
1129     @Override
1130     public int compareKey(KVComparator comparator, byte[] key, int offset, int length) {
1131       return seeker.compareKey(comparator, key, offset, length);
1132     }
1133 
1134     @Override
1135     public ByteBuffer getValue() {
1136       assertValidSeek();
1137       return seeker.getValueShallowCopy();
1138     }
1139 
1140     @Override
1141     public KeyValue getKeyValue() {
1142       if (block == null) {
1143         return null;
1144       }
1145       return seeker.getKeyValue();
1146     }
1147 
1148     @Override
1149     public String getKeyString() {
1150       ByteBuffer keyBuffer = getKey();
1151       return Bytes.toStringBinary(keyBuffer.array(),
1152           keyBuffer.arrayOffset(), keyBuffer.limit());
1153     }
1154 
1155     @Override
1156     public String getValueString() {
1157       ByteBuffer valueBuffer = getValue();
1158       return Bytes.toStringBinary(valueBuffer.array(),
1159           valueBuffer.arrayOffset(), valueBuffer.limit());
1160     }
1161 
1162     private void assertValidSeek() {
1163       if (block == null) {
1164         throw new NotSeekedException();
1165       }
1166     }
1167 
1168     @Override
1169     protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
1170       return dataBlockEncoder.getFirstKeyInBlock(getEncodedBuffer(curBlock));
1171     }
1172 
1173     @Override
1174     protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
1175         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
1176         throws IOException  {
1177       if (block == null || block.getOffset() != seekToBlock.getOffset()) {
1178         updateCurrentBlock(seekToBlock);
1179       } else if (rewind) {
1180         seeker.rewind();
1181       }
1182       this.nextIndexedKey = nextIndexedKey;
1183       return seeker.seekToKeyInBlock(key, offset, length, seekBefore);
1184     }
1185   }
1186 
1187   /**
1188    * Returns a buffer with the Bloom filter metadata. The caller takes
1189    * ownership of the buffer.
1190    */
1191   @Override
1192   public DataInput getGeneralBloomFilterMetadata() throws IOException {
1193     return this.getBloomFilterMetadata(BlockType.GENERAL_BLOOM_META);
1194   }
1195 
1196   @Override
1197   public DataInput getDeleteBloomFilterMetadata() throws IOException {
1198     return this.getBloomFilterMetadata(BlockType.DELETE_FAMILY_BLOOM_META);
1199   }
1200 
1201   private DataInput getBloomFilterMetadata(BlockType blockType)
1202   throws IOException {
1203     if (blockType != BlockType.GENERAL_BLOOM_META &&
1204         blockType != BlockType.DELETE_FAMILY_BLOOM_META) {
1205       throw new RuntimeException("Block Type: " + blockType.toString() +
1206           " is not supported") ;
1207     }
1208 
1209     for (HFileBlock b : loadOnOpenBlocks)
1210       if (b.getBlockType() == blockType)
1211         return b.getByteStream();
1212     return null;
1213   }
1214 
1215   @Override
1216   public boolean isFileInfoLoaded() {
1217     return true; // We load file info in constructor in version 2.
1218   }
1219 
1220   /**
1221    * Validates that the minor version is within acceptable limits.
1222    * Otherwise throws an Runtime exception
1223    */
1224   private void validateMinorVersion(Path path, int minorVersion) {
1225     if (minorVersion < MIN_MINOR_VERSION ||
1226         minorVersion > MAX_MINOR_VERSION) {
1227       String msg = "Minor version for path " + path + 
1228                    " is expected to be between " +
1229                    MIN_MINOR_VERSION + " and " + MAX_MINOR_VERSION +
1230                    " but is found to be " + minorVersion;
1231       LOG.error(msg);
1232       throw new RuntimeException(msg);
1233     }
1234   }
1235 
1236   @Override
1237   public int getMajorVersion() {
1238     return 2;
1239   }
1240 
1241   @Override
1242   public HFileContext getFileContext() {
1243     return hfileContext;
1244   }
1245 
1246   /**
1247    * Returns false if block prefetching was requested for this file and has
1248    * not completed, true otherwise
1249    */
1250   @VisibleForTesting
1251   boolean prefetchComplete() {
1252     return PrefetchExecutor.isCompleted(path);
1253   }
1254 }