View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.io.hfile;
19  
20  import java.io.DataInput;
21  import java.io.IOException;
22  import java.nio.ByteBuffer;
23  import java.util.ArrayList;
24  import java.util.List;
25  
26  import org.apache.commons.logging.Log;
27  import org.apache.commons.logging.LogFactory;
28  import org.apache.hadoop.hbase.classification.InterfaceAudience;
29  import org.apache.hadoop.conf.Configuration;
30  import org.apache.hadoop.fs.Path;
31  import org.apache.hadoop.hbase.HConstants;
32  import org.apache.hadoop.hbase.KeyValue;
33  import org.apache.hadoop.hbase.KeyValue.KVComparator;
34  import org.apache.hadoop.hbase.NoTagsKeyValue;
35  import org.apache.hadoop.hbase.fs.HFileSystem;
36  import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper;
37  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoder;
38  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
39  import org.apache.hadoop.hbase.io.encoding.HFileBlockDecodingContext;
40  import org.apache.hadoop.hbase.io.hfile.HFile.FileInfo;
41  import org.apache.hadoop.hbase.util.ByteBufferUtils;
42  import org.apache.hadoop.hbase.util.Bytes;
43  import org.apache.hadoop.hbase.util.IdLock;
44  import org.apache.hadoop.io.WritableUtils;
45  import org.cloudera.htrace.Trace;
46  import org.cloudera.htrace.TraceScope;
47  
48  import com.google.common.annotations.VisibleForTesting;
49  
50  /**
51   * {@link HFile} reader for version 2.
52   */
53  @InterfaceAudience.Private
54  public class HFileReaderV2 extends AbstractHFileReader {
55  
/** Class-wide logger. */
private static final Log LOG = LogFactory.getLog(HFileReaderV2.class);

/** First HFile V2 minor version whose blocks carry HBase-level checksums. */
public static final int MINOR_VERSION_WITH_CHECKSUM = 1;
/** HFile V2 minor version that does not support HBase-level checksums. */
public static final int MINOR_VERSION_NO_CHECKSUM = 0;

/** HFile minor version that introduced the protobuf-encoded file trailer. */
public static final int PBUF_TRAILER_MINOR_VERSION = 2;

/**
 * The size of a (key length, value length) tuple that prefixes each entry in
 * a data block: two ints.
 */
public final static int KEY_VALUE_LEN_SIZE = 2 * Bytes.SIZEOF_INT;
71  
/**
 * Whether cells in this file carry a memstore timestamp. Assigned in the
 * constructor from the key/value format version stored in the file info.
 */
protected boolean includesMemstoreTS = false;
/**
 * Set in the constructor, only when {@link #includesMemstoreTS} is true, to
 * whether the file's recorded maximum memstore timestamp is greater than zero.
 */
protected boolean decodeMemstoreTS = false;
/** @return the current value of {@link #includesMemstoreTS} */
protected boolean shouldIncludeMemstoreTS() {
  return includesMemstoreTS;
}
77  
/** Filesystem-level block reader. */
protected HFileBlock.FSReader fsBlockReader;

/**
 * A "sparse lock" implementation that allows locking on a particular block
 * identified by its offset. The purpose is to avoid two clients loading the
 * same block: all but one client wait and then get the block from the cache.
 */
private IdLock offsetLock = new IdLock();

/**
 * Blocks read from the load-on-open section, excluding the data root index,
 * the meta index, and the file info.
 */
private List<HFileBlock> loadOnOpenBlocks = new ArrayList<HFileBlock>();

/** Minimum minor version supported by this HFile format. */
static final int MIN_MINOR_VERSION = 0;

/** Maximum minor version supported by this HFile format. */
// We went to version 2 when we moved to pb'ing fileinfo and the trailer on
// the file. This version can read Writables version 1.
static final int MAX_MINOR_VERSION = 3;

/** Minor versions starting with this number have a faked index key. */
static final int MINOR_VERSION_WITH_FAKED_KEY = 3;

/** Context built by {@link #createHFileContext} in the constructor. */
protected HFileContext hfileContext;
107 
/**
 * Opens an HFile and parses its load-on-open section: the data block index
 * root, the meta block index root and the file info. Remaining load-on-open
 * blocks are kept for later consumption. If the cache configuration asks for
 * prefetch-on-open, a background task is submitted that reads every block
 * from offset 0 up to the trailer.
 * <p>
 * The earlier documentation stated that the index must be loaded via
 * {@link #loadFileInfo()} before use. NOTE(review): confirm whether callers
 * still need to do that, since this constructor reads the file info itself.
 *
 * @param path Path to HFile.
 * @param trailer File trailer.
 * @param fsdis input stream wrapper used to read blocks.
 * @param size Length of the stream.
 * @param cacheConf Cache configuration.
 * @param hfs filesystem handed to the block reader.
 * @param conf configuration stored on this reader.
 * @throws IOException if reading or validating the load-on-open section fails
 */
public HFileReaderV2(final Path path, final FixedFileTrailer trailer,
    final FSDataInputStreamWrapper fsdis, final long size, final CacheConfig cacheConf,
    final HFileSystem hfs, final Configuration conf) throws IOException {
  super(path, trailer, size, cacheConf, hfs, conf);
  this.conf = conf;
  trailer.expectMajorVersion(getMajorVersion());
  validateMinorVersion(path, trailer.getMinorVersion());
  // Note: the context is created before includesMemstoreTS is read from the
  // file info further below.
  this.hfileContext = createHFileContext(fsdis, fileSize, hfs, path, trailer);
  HFileBlock.FSReaderV2 fsBlockReaderV2 = new HFileBlock.FSReaderV2(fsdis, fileSize, hfs, path,
      hfileContext);
  this.fsBlockReader = fsBlockReaderV2; // upcast

  // Comparator class name is stored in the trailer in version 2.
  comparator = trailer.createComparator();
  dataBlockIndexReader = new HFileBlockIndex.BlockIndexReader(comparator,
      trailer.getNumDataIndexLevels(), this);
  metaBlockIndexReader = new HFileBlockIndex.BlockIndexReader(
      KeyValue.RAW_COMPARATOR, 1);

  // Parse load-on-open data: everything between the load-on-open offset and
  // the start of the trailer.

  HFileBlock.BlockIterator blockIter = fsBlockReaderV2.blockRange(
      trailer.getLoadOnOpenDataOffset(),
      fileSize - trailer.getTrailerSize());

  // Data index. We also read statistics about the block index written after
  // the root level.
  dataBlockIndexReader.readMultiLevelIndexRoot(
      blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
      trailer.getDataIndexCount());

  // Meta index.
  metaBlockIndexReader.readRootIndex(
      blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
      trailer.getMetaIndexCount());

  // File info
  fileInfo = new FileInfo();
  fileInfo.read(blockIter.nextBlockWithBlockType(BlockType.FILE_INFO).getByteStream());
  lastKey = fileInfo.get(FileInfo.LASTKEY);
  avgKeyLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_KEY_LEN));
  avgValueLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_VALUE_LEN));
  byte [] keyValueFormatVersion =
      fileInfo.get(HFileWriterV2.KEY_VALUE_VERSION);
  // Memstore timestamps are present only when the file info records the
  // matching key/value format version.
  includesMemstoreTS = keyValueFormatVersion != null &&
      Bytes.toInt(keyValueFormatVersion) ==
          HFileWriterV2.KEY_VALUE_VER_WITH_MEMSTORE;
  fsBlockReaderV2.setIncludesMemstoreTS(includesMemstoreTS);
  if (includesMemstoreTS) {
    // Decode timestamps only when the recorded maximum is greater than zero.
    decodeMemstoreTS = Bytes.toLong(fileInfo.get(HFileWriterV2.MAX_MEMSTORE_TS_KEY)) > 0;
  }

  // Read data block encoding algorithm name from file info.
  dataBlockEncoder = HFileDataBlockEncoderImpl.createFromFileInfo(fileInfo);
  fsBlockReaderV2.setDataBlockEncoder(dataBlockEncoder);

  // Store all other load-on-open blocks for further consumption.
  HFileBlock b;
  while ((b = blockIter.nextBlock()) != null) {
    loadOnOpenBlocks.add(b);
  }

  // Prefetch file blocks upon open if requested
  if (cacheConf.shouldPrefetchOnOpen()) {
    PrefetchExecutor.request(path, new Runnable() {
      public void run() {
        try {
          long offset = 0;
          long end = fileSize - getTrailer().getTrailerSize();
          HFileBlock prevBlock = null;
          while (offset < end) {
            // Stop quietly when the prefetch thread has been interrupted.
            if (Thread.interrupted()) {
              break;
            }
            // -1 means the on-disk size is unknown; after the first block the
            // previous block supplies the size of the one that follows it.
            long onDiskSize = -1;
            if (prevBlock != null) {
              onDiskSize = prevBlock.getNextBlockOnDiskSizeWithHeader();
            }
            // cacheBlock=true; pread, isCompaction and updateCacheMetrics are
            // false; no expected block type.
            HFileBlock block = readBlock(offset, onDiskSize, true, false, false, false, null);
            prevBlock = block;
            offset += block.getOnDiskSizeWithHeader();
          }
        } catch (IOException e) {
          // IOExceptions are probably due to region closes (relocation, etc.)
          if (LOG.isTraceEnabled()) {
            LOG.trace("Exception encountered while prefetching " + path + ":", e);
          }
        } catch (Exception e) {
          // Other exceptions are interesting
          LOG.warn("Exception encountered while prefetching " + path + ":", e);
        } finally {
          // Always report completion, whether the loop finished or failed.
          PrefetchExecutor.complete(path);
        }
      }
    });
  }
}
217 
218   protected HFileContext createHFileContext(FSDataInputStreamWrapper fsdis, long fileSize,
219       HFileSystem hfs, Path path, FixedFileTrailer trailer) throws IOException {
220     return new HFileContextBuilder()
221       .withIncludesMvcc(this.includesMemstoreTS)
222       .withCompression(this.compressAlgo)
223       .withHBaseCheckSum(trailer.getMinorVersion() >= MINOR_VERSION_WITH_CHECKSUM)
224       .build();
225   }
226 
227   /**
228    * Create a Scanner on this file. No seeks or reads are done on creation. Call
229    * {@link HFileScanner#seekTo(byte[])} to position an start the read. There is
230    * nothing to clean up in a Scanner. Letting go of your references to the
231    * scanner is sufficient.
232    *
233    * @param cacheBlocks True if we should cache blocks read in by this scanner.
234    * @param pread Use positional read rather than seek+read if true (pread is
235    *          better for random reads, seek+read is better scanning).
236    * @param isCompaction is scanner being used for a compaction?
237    * @return Scanner on this file.
238    */
239    @Override
240    public HFileScanner getScanner(boolean cacheBlocks, final boolean pread,
241       final boolean isCompaction) {
242     if (dataBlockEncoder.useEncodedScanner()) {
243       return new EncodedScannerV2(this, cacheBlocks, pread, isCompaction,
244           hfileContext);
245     }
246 
247     return new ScannerV2(this, cacheBlocks, pread, isCompaction);
248   }
249 
250   /**
251    * @param metaBlockName
252    * @param cacheBlock Add block to cache, if found
253    * @return block wrapped in a ByteBuffer, with header skipped
254    * @throws IOException
255    */
256   @Override
257   public ByteBuffer getMetaBlock(String metaBlockName, boolean cacheBlock)
258       throws IOException {
259     if (trailer.getMetaIndexCount() == 0) {
260       return null; // there are no meta blocks
261     }
262     if (metaBlockIndexReader == null) {
263       throw new IOException("Meta index not loaded");
264     }
265 
266     byte[] mbname = Bytes.toBytes(metaBlockName);
267     int block = metaBlockIndexReader.rootBlockContainingKey(mbname, 0,
268         mbname.length);
269     if (block == -1)
270       return null;
271     long blockSize = metaBlockIndexReader.getRootBlockDataSize(block);
272 
273     // Per meta key from any given file, synchronize reads for said block. This
274     // is OK to do for meta blocks because the meta block index is always
275     // single-level.
276     synchronized (metaBlockIndexReader.getRootBlockKey(block)) {
277       // Check cache for block. If found return.
278       long metaBlockOffset = metaBlockIndexReader.getRootBlockOffset(block);
279       BlockCacheKey cacheKey = new BlockCacheKey(name, metaBlockOffset,
280           DataBlockEncoding.NONE, BlockType.META);
281 
282       cacheBlock &= cacheConf.shouldCacheDataOnRead();
283       if (cacheConf.isBlockCacheEnabled()) {
284         HFileBlock cachedBlock =
285           (HFileBlock) cacheConf.getBlockCache().getBlock(cacheKey, cacheBlock, false, true);
286         if (cachedBlock != null) {
287           assert cachedBlock.isUnpacked() : "Packed block leak.";
288           // Return a distinct 'shallow copy' of the block,
289           // so pos does not get messed by the scanner
290           return cachedBlock.getBufferWithoutHeader();
291         }
292         // Cache Miss, please load.
293       }
294 
295       HFileBlock metaBlock = fsBlockReader.readBlockData(metaBlockOffset,
296           blockSize, -1, true).unpack(hfileContext, fsBlockReader);
297 
298       // Cache the block
299       if (cacheBlock) {
300         cacheConf.getBlockCache().cacheBlock(cacheKey, metaBlock,
301             cacheConf.isInMemory());
302       }
303 
304       return metaBlock.getBufferWithoutHeader();
305     }
306   }
307 
308   /**
309    * Read in a file block.
310    * @param dataBlockOffset offset to read.
311    * @param onDiskBlockSize size of the block
312    * @param cacheBlock
313    * @param pread Use positional read instead of seek+read (positional is
314    *          better doing random reads whereas seek+read is better scanning).
315    * @param isCompaction is this block being read as part of a compaction
316    * @param expectedBlockType the block type we are expecting to read with this
317    *          read operation, or null to read whatever block type is available
318    *          and avoid checking (that might reduce caching efficiency of
319    *          encoded data blocks)
320    * @return Block wrapped in a ByteBuffer.
321    * @throws IOException
322    */
323   @Override
324   public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize,
325       final boolean cacheBlock, boolean pread, final boolean isCompaction,
326       final boolean updateCacheMetrics, BlockType expectedBlockType)
327       throws IOException {
328     if (dataBlockIndexReader == null) {
329       throw new IOException("Block index not loaded");
330     }
331     if (dataBlockOffset < 0
332         || dataBlockOffset >= trailer.getLoadOnOpenDataOffset()) {
333       throw new IOException("Requested block is out of range: "
334           + dataBlockOffset + ", lastDataBlockOffset: "
335           + trailer.getLastDataBlockOffset());
336     }
337     // For any given block from any given file, synchronize reads for said
338     // block.
339     // Without a cache, this synchronizing is needless overhead, but really
340     // the other choice is to duplicate work (which the cache would prevent you
341     // from doing).
342 
343     BlockCacheKey cacheKey =
344         new BlockCacheKey(name, dataBlockOffset,
345             dataBlockEncoder.getDataBlockEncoding(),
346             expectedBlockType);
347 
348     boolean useLock = false;
349     IdLock.Entry lockEntry = null;
350     TraceScope traceScope = Trace.startSpan("HFileReaderV2.readBlock");
351     try {
352       while (true) {
353         // Check cache for block. If found return.
354         if (cacheConf.shouldReadBlockFromCache(expectedBlockType)) {
355           if (useLock) {
356             lockEntry = offsetLock.getLockEntry(dataBlockOffset);
357           }
358           // Try and get the block from the block cache. If the useLock variable is true then this
359           // is the second time through the loop and it should not be counted as a block cache miss.
360           HFileBlock cachedBlock = (HFileBlock) cacheConf.getBlockCache().getBlock(cacheKey, 
361             cacheBlock, useLock, updateCacheMetrics);
362           if (cachedBlock != null) {
363             if (cacheConf.shouldCacheCompressed(cachedBlock.getBlockType().getCategory())) {
364               cachedBlock = cachedBlock.unpack(hfileContext, fsBlockReader);
365             }
366             if (Trace.isTracing()) {
367               traceScope.getSpan().addTimelineAnnotation("blockCacheHit");
368             }
369             assert cachedBlock.isUnpacked() : "Packed block leak.";
370             if (cachedBlock.getBlockType().isData()) {
371               HFile.dataBlockReadCnt.incrementAndGet();
372 
373               // Validate encoding type for data blocks. We include encoding
374               // type in the cache key, and we expect it to match on a cache hit.
375               if (cachedBlock.getDataBlockEncoding() != dataBlockEncoder.getDataBlockEncoding()) {
376                 throw new IOException("Cached block under key " + cacheKey + " "
377                   + "has wrong encoding: " + cachedBlock.getDataBlockEncoding() + " (expected: "
378                   + dataBlockEncoder.getDataBlockEncoding() + ")");
379               }
380             }
381             return cachedBlock;
382           }
383           if (!useLock && cacheBlock && cacheConf.shouldLockOnCacheMiss(expectedBlockType)) {
384             // check cache again with lock
385             useLock = true;
386             continue;
387           }
388           // Carry on, please load.
389         }
390 
391         if (Trace.isTracing()) {
392           traceScope.getSpan().addTimelineAnnotation("blockCacheMiss");
393         }
394         // Load block from filesystem.
395         HFileBlock hfileBlock = fsBlockReader.readBlockData(dataBlockOffset, onDiskBlockSize, -1,
396             pread);
397         validateBlockType(hfileBlock, expectedBlockType);
398         HFileBlock unpacked = hfileBlock.unpack(hfileContext, fsBlockReader);
399         BlockType.BlockCategory category = hfileBlock.getBlockType().getCategory();
400 
401         // Cache the block if necessary
402         if (cacheBlock && cacheConf.shouldCacheBlockOnRead(category)) {
403           cacheConf.getBlockCache().cacheBlock(cacheKey,
404             cacheConf.shouldCacheCompressed(category) ? hfileBlock : unpacked,
405             cacheConf.isInMemory());
406         }
407 
408         if (updateCacheMetrics && hfileBlock.getBlockType().isData()) {
409           HFile.dataBlockReadCnt.incrementAndGet();
410         }
411 
412         return unpacked;
413       }
414     } finally {
415       traceScope.close();
416       if (lockEntry != null) {
417         offsetLock.releaseLockEntry(lockEntry);
418       }
419     }
420   }
421 
422   @Override
423   public boolean hasMVCCInfo() {
424     return includesMemstoreTS && decodeMemstoreTS;
425   }
426 
427   /**
428    * Compares the actual type of a block retrieved from cache or disk with its
429    * expected type and throws an exception in case of a mismatch. Expected
430    * block type of {@link BlockType#DATA} is considered to match the actual
431    * block type [@link {@link BlockType#ENCODED_DATA} as well.
432    * @param block a block retrieved from cache or disk
433    * @param expectedBlockType the expected block type, or null to skip the
434    *          check
435    */
436   private void validateBlockType(HFileBlock block,
437       BlockType expectedBlockType) throws IOException {
438     if (expectedBlockType == null) {
439       return;
440     }
441     BlockType actualBlockType = block.getBlockType();
442     if (actualBlockType == BlockType.ENCODED_DATA &&
443         expectedBlockType == BlockType.DATA) {
444       // We consider DATA to match ENCODED_DATA for the purpose of this
445       // verification.
446       return;
447     }
448     if (actualBlockType != expectedBlockType) {
449       throw new IOException("Expected block type " + expectedBlockType + ", " +
450           "but got " + actualBlockType + ": " + block);
451     }
452   }
453 
454   /**
455    * @return Last key in the file. May be null if file has no entries. Note that
456    *         this is not the last row key, but rather the byte form of the last
457    *         KeyValue.
458    */
459   @Override
460   public byte[] getLastKey() {
461     return dataBlockIndexReader.isEmpty() ? null : lastKey;
462   }
463 
464   /**
465    * @return Midkey for this file. We work with block boundaries only so
466    *         returned midkey is an approximation only.
467    * @throws IOException
468    */
469   @Override
470   public byte[] midkey() throws IOException {
471     return dataBlockIndexReader.midkey();
472   }
473 
/**
 * Closes this reader, evicting its blocks from the cache when the cache
 * configuration's evict-on-close setting says so.
 *
 * @throws IOException if closing fails
 */
@Override
public void close() throws IOException {
  close(cacheConf.shouldEvictOnClose());
}
478 
479   public void close(boolean evictOnClose) throws IOException {
480     PrefetchExecutor.cancel(path);
481     if (evictOnClose && cacheConf.isBlockCacheEnabled()) {
482       int numEvicted = cacheConf.getBlockCache().evictBlocksByHfileName(name);
483       if (LOG.isTraceEnabled()) {
484         LOG.trace("On close, file=" + name + " evicted=" + numEvicted
485           + " block(s)");
486       }
487     }
488     fsBlockReader.closeStreams();
489   }
490 
/**
 * For testing.
 *
 * @return the filesystem-level block reader held by this reader
 */
@Override
HFileBlock.FSReader getUncachedBlockReader() {
  return fsBlockReader;
}
496 
497 
498   protected abstract static class AbstractScannerV2
499       extends AbstractHFileReader.Scanner {
500     protected HFileBlock block;
501 
502     @Override
503     public byte[] getNextIndexedKey() {
504       return nextIndexedKey;
505     }
506     /**
507      * The next indexed key is to keep track of the indexed key of the next data block.
508      * If the nextIndexedKey is HConstants.NO_NEXT_INDEXED_KEY, it means that the
509      * current data block is the last data block.
510      *
511      * If the nextIndexedKey is null, it means the nextIndexedKey has not been loaded yet.
512      */
513     protected byte[] nextIndexedKey;
514 
515     public AbstractScannerV2(HFileReaderV2 r, boolean cacheBlocks,
516         final boolean pread, final boolean isCompaction) {
517       super(r, cacheBlocks, pread, isCompaction);
518     }
519 
520     /**
521      * An internal API function. Seek to the given key, optionally rewinding to
522      * the first key of the block before doing the seek.
523      *
524      * @param key key byte array
525      * @param offset key offset in the key byte array
526      * @param length key length
527      * @param rewind whether to rewind to the first key of the block before
528      *        doing the seek. If this is false, we are assuming we never go
529      *        back, otherwise the result is undefined.
530      * @return -1 if the key is earlier than the first key of the file,
531      *         0 if we are at the given key, 1 if we are past the given key
532      *         -2 if the key is earlier than the first key of the file while
533      *         using a faked index key
534      * @throws IOException
535      */
536     protected int seekTo(byte[] key, int offset, int length, boolean rewind)
537         throws IOException {
538       HFileBlockIndex.BlockIndexReader indexReader =
539           reader.getDataBlockIndexReader();
540       BlockWithScanInfo blockWithScanInfo =
541         indexReader.loadDataBlockWithScanInfo(key, offset, length, block,
542             cacheBlocks, pread, isCompaction);
543       if (blockWithScanInfo == null || blockWithScanInfo.getHFileBlock() == null) {
544         // This happens if the key e.g. falls before the beginning of the file.
545         return -1;
546       }
547       return loadBlockAndSeekToKey(blockWithScanInfo.getHFileBlock(),
548           blockWithScanInfo.getNextIndexedKey(), rewind, key, offset, length, false);
549     }
550 
551     protected abstract ByteBuffer getFirstKeyInBlock(HFileBlock curBlock);
552 
553     protected abstract int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
554         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
555         throws IOException;
556 
557     @Override
558     public int seekTo(byte[] key, int offset, int length) throws IOException {
559       // Always rewind to the first key of the block, because the given key
560       // might be before or after the current key.
561       return seekTo(key, offset, length, true);
562     }
563 
564     @Override
565     public int reseekTo(byte[] key, int offset, int length) throws IOException {
566       int compared;
567       if (isSeeked()) {
568         compared = compareKey(reader.getComparator(), key, offset, length);
569         if (compared < 1) {
570           // If the required key is less than or equal to current key, then
571           // don't do anything.
572           return compared;
573         } else {
574           if (this.nextIndexedKey != null &&
575               (this.nextIndexedKey == HConstants.NO_NEXT_INDEXED_KEY ||
576                reader.getComparator().compareFlatKey(key, offset, length,
577                    nextIndexedKey, 0, nextIndexedKey.length) < 0)) {
578             // The reader shall continue to scan the current data block instead of querying the
579             // block index as long as it knows the target key is strictly smaller than
580             // the next indexed key or the current data block is the last data block.
581             return loadBlockAndSeekToKey(this.block, this.nextIndexedKey,
582                 false, key, offset, length, false);
583           }
584         }
585       }
586       // Don't rewind on a reseek operation, because reseek implies that we are
587       // always going forward in the file.
588       return seekTo(key, offset, length, false);
589     }
590 
591     @Override
592     public boolean seekBefore(byte[] key, int offset, int length)
593         throws IOException {
594       HFileBlock seekToBlock =
595           reader.getDataBlockIndexReader().seekToDataBlock(key, offset, length,
596               block, cacheBlocks, pread, isCompaction);
597       if (seekToBlock == null) {
598         return false;
599       }
600       ByteBuffer firstKey = getFirstKeyInBlock(seekToBlock);
601 
602       if (reader.getComparator().compareFlatKey(firstKey.array(),
603           firstKey.arrayOffset(), firstKey.limit(), key, offset, length) >= 0)
604       {
605         long previousBlockOffset = seekToBlock.getPrevBlockOffset();
606         // The key we are interested in
607         if (previousBlockOffset == -1) {
608           // we have a 'problem', the key we want is the first of the file.
609           return false;
610         }
611 
612         // It is important that we compute and pass onDiskSize to the block
613         // reader so that it does not have to read the header separately to
614         // figure out the size.  Currently, we do not have a way to do this
615         // correctly in the general case however.
616         // TODO: See https://issues.apache.org/jira/browse/HBASE-14576
617         int prevBlockSize = -1;
618         seekToBlock = reader.readBlock(previousBlockOffset,
619             prevBlockSize, cacheBlocks,
620             pread, isCompaction, true, BlockType.DATA);
621         // TODO shortcut: seek forward in this block to the last key of the
622         // block.
623       }
624       byte[] firstKeyInCurrentBlock = Bytes.getBytes(firstKey);
625       loadBlockAndSeekToKey(seekToBlock, firstKeyInCurrentBlock, true, key, offset, length, true);
626       return true;
627     }
628 
629 
630     /**
631      * Scans blocks in the "scanned" section of the {@link HFile} until the next
632      * data block is found.
633      *
634      * @return the next block, or null if there are no more data blocks
635      * @throws IOException
636      */
637     protected HFileBlock readNextDataBlock() throws IOException {
638       long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
639       if (block == null)
640         return null;
641 
642       HFileBlock curBlock = block;
643 
644       do {
645         if (curBlock.getOffset() >= lastDataBlockOffset)
646           return null;
647 
648         if (curBlock.getOffset() < 0) {
649           throw new IOException("Invalid block file offset: " + block);
650         }
651 
652         // We are reading the next block without block type validation, because
653         // it might turn out to be a non-data block.
654         curBlock = reader.readBlock(curBlock.getOffset()
655             + curBlock.getOnDiskSizeWithHeader(),
656             curBlock.getNextBlockOnDiskSizeWithHeader(), cacheBlocks, pread,
657             isCompaction, true, null);
658       } while (!curBlock.getBlockType().isData());
659 
660       return curBlock;
661     }
662     /**
663      * Compare the given key against the current key
664      * @param comparator
665      * @param key
666      * @param offset
667      * @param length
668      * @return -1 is the passed key is smaller than the current key, 0 if equal and 1 if greater
669      */
670     public abstract int compareKey(KVComparator comparator, byte[] key, int offset,
671         int length);
672   }
673 
674   /**
675    * Implementation of {@link HFileScanner} interface.
676    */
677   protected static class ScannerV2 extends AbstractScannerV2 {
/**
 * The reader typed as {@link HFileReaderV2}, kept so this scanner can call
 * V2-specific methods on it.
 */
private HFileReaderV2 reader;

/**
 * @param r the reader this scanner reads from
 * @param cacheBlocks whether blocks read by this scanner should be cached
 * @param pread whether to use positional reads
 * @param isCompaction whether this scanner serves a compaction
 */
public ScannerV2(HFileReaderV2 r, boolean cacheBlocks,
    final boolean pread, final boolean isCompaction) {
  super(r, cacheBlocks, pread, isCompaction);
  this.reader = r;
}
685 
686     @Override
687     public KeyValue getKeyValue() {
688       if (!isSeeked())
689         return null;
690 
691       // HFile V2 do not support tags.
692       return formNoTagsKeyValue();
693     }
694 
695     protected KeyValue formNoTagsKeyValue() {
696       KeyValue ret = new NoTagsKeyValue(blockBuffer.array(), blockBuffer.arrayOffset()
697           + blockBuffer.position(), getCellBufSize());
698       if (this.reader.shouldIncludeMemstoreTS()) {
699         ret.setMvccVersion(currMemstoreTS);
700       }
701       return ret;
702     }
703 
704     protected int getCellBufSize() {
705       return KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen;
706     }
707 
708     @Override
709     public ByteBuffer getKey() {
710       assertSeeked();
711       return ByteBuffer.wrap(
712           blockBuffer.array(),
713           blockBuffer.arrayOffset() + blockBuffer.position()
714               + KEY_VALUE_LEN_SIZE, currKeyLen).slice();
715     }
716 
717     @Override
718     public int compareKey(KVComparator comparator, byte[] key, int offset, int length) {
719       return comparator.compareFlatKey(key, offset, length, blockBuffer.array(),
720           blockBuffer.arrayOffset() + blockBuffer.position() + KEY_VALUE_LEN_SIZE, currKeyLen);
721     }
722 
723     @Override
724     public ByteBuffer getValue() {
725       assertSeeked();
726       return ByteBuffer.wrap(
727           blockBuffer.array(),
728           blockBuffer.arrayOffset() + blockBuffer.position()
729               + KEY_VALUE_LEN_SIZE + currKeyLen, currValueLen).slice();
730     }
731 
732     protected void setNonSeekedState() {
733       block = null;
734       blockBuffer = null;
735       currKeyLen = 0;
736       currValueLen = 0;
737       currMemstoreTS = 0;
738       currMemstoreTSLen = 0;
739     }
740 
741     /**
742      * Go to the next key/value in the block section. Loads the next block if
743      * necessary. If successful, {@link #getKey()} and {@link #getValue()} can
744      * be called.
745      *
746      * @return true if successfully navigated to the next key/value
747      */
748     @Override
749     public boolean next() throws IOException {
750       assertSeeked();
751 
752       try {
753         blockBuffer.position(getNextCellStartPosition());
754       } catch (IllegalArgumentException e) {
755         LOG.error("Current pos = " + blockBuffer.position()
756             + "; currKeyLen = " + currKeyLen + "; currValLen = "
757             + currValueLen + "; block limit = " + blockBuffer.limit()
758             + "; HFile name = " + reader.getName()
759             + "; currBlock currBlockOffset = " + block.getOffset());
760         throw e;
761       }
762 
763       if (blockBuffer.remaining() <= 0) {
764         long lastDataBlockOffset =
765             reader.getTrailer().getLastDataBlockOffset();
766 
767         if (block.getOffset() >= lastDataBlockOffset) {
768           setNonSeekedState();
769           return false;
770         }
771 
772         // read the next block
773         HFileBlock nextBlock = readNextDataBlock();
774         if (nextBlock == null) {
775           setNonSeekedState();
776           return false;
777         }
778 
779         updateCurrBlock(nextBlock);
780         return true;
781       }
782 
783       // We are still in the same block.
784       readKeyValueLen();
785       return true;
786     }
787 
788     protected int getNextCellStartPosition() {
789       return blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen
790           + currMemstoreTSLen;
791     }
792 
793     /**
794      * Positions this scanner at the start of the file.
795      *
796      * @return false if empty file; i.e. a call to next would return false and
797      *         the current key and value are undefined.
798      * @throws IOException
799      */
800     @Override
801     public boolean seekTo() throws IOException {
802       if (reader == null) {
803         return false;
804       }
805 
806       if (reader.getTrailer().getEntryCount() == 0) {
807         // No data blocks.
808         return false;
809       }
810 
811       long firstDataBlockOffset =
812           reader.getTrailer().getFirstDataBlockOffset();
813       if (block != null && block.getOffset() == firstDataBlockOffset) {
814         blockBuffer.rewind();
815         readKeyValueLen();
816         return true;
817       }
818 
819       block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
820           isCompaction, true, BlockType.DATA);
821       if (block.getOffset() < 0) {
822         throw new IOException("Invalid block offset: " + block.getOffset());
823       }
824       updateCurrBlock(block);
825       return true;
826     }
827 
828     @Override
829     protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
830         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
831         throws IOException {
832       if (block == null || block.getOffset() != seekToBlock.getOffset()) {
833         updateCurrBlock(seekToBlock);
834       } else if (rewind) {
835         blockBuffer.rewind();
836       }
837 
838       // Update the nextIndexedKey
839       this.nextIndexedKey = nextIndexedKey;
840       return blockSeek(key, offset, length, seekBefore);
841     }
842 
843     /**
844      * Updates the current block to be the given {@link HFileBlock}. Seeks to
845      * the the first key/value pair.
846      *
847      * @param newBlock the block to make current
848      */
849     protected void updateCurrBlock(HFileBlock newBlock) {
850       block = newBlock;
851 
852       // sanity check
853       if (block.getBlockType() != BlockType.DATA) {
854         throw new IllegalStateException("ScannerV2 works only on data " +
855             "blocks, got " + block.getBlockType() + "; " +
856             "fileName=" + reader.name + ", " +
857             "dataBlockEncoder=" + reader.dataBlockEncoder + ", " +
858             "isCompaction=" + isCompaction);
859       }
860 
861       blockBuffer = block.getBufferWithoutHeader();
862       readKeyValueLen();
863       blockFetches++;
864 
865       // Reset the next indexed key
866       this.nextIndexedKey = null;
867     }
868 
869     protected void readKeyValueLen() {
870       blockBuffer.mark();
871       currKeyLen = blockBuffer.getInt();
872       currValueLen = blockBuffer.getInt();
873       ByteBufferUtils.skip(blockBuffer, currKeyLen + currValueLen);
874       readMvccVersion();
875       if (currKeyLen < 0 || currValueLen < 0
876           || currKeyLen > blockBuffer.limit()
877           || currValueLen > blockBuffer.limit()) {
878         throw new IllegalStateException("Invalid currKeyLen " + currKeyLen
879             + " or currValueLen " + currValueLen + ". Block offset: "
880             + block.getOffset() + ", block length: " + blockBuffer.limit()
881             + ", position: " + blockBuffer.position() + " (without header).");
882       }
883       blockBuffer.reset();
884     }
885 
886     protected void readMvccVersion() {
887       if (this.reader.shouldIncludeMemstoreTS()) {
888         if (this.reader.decodeMemstoreTS) {
889           currMemstoreTS = Bytes.readAsVLong(blockBuffer.array(), blockBuffer.arrayOffset()
890               + blockBuffer.position());
891           currMemstoreTSLen = WritableUtils.getVIntSize(currMemstoreTS);
892         } else {
893           currMemstoreTS = 0;
894           currMemstoreTSLen = 1;
895         }
896       }
897     }
898 
899     /**
900      * Within a loaded block, seek looking for the last key that is smaller
901      * than (or equal to?) the key we are interested in.
902      *
903      * A note on the seekBefore: if you have seekBefore = true, AND the first
904      * key in the block = key, then you'll get thrown exceptions. The caller has
905      * to check for that case and load the previous block as appropriate.
906      *
907      * @param key the key to find
908      * @param seekBefore find the key before the given key in case of exact
909      *          match.
910      * @return 0 in case of an exact key match, 1 in case of an inexact match,
911      *         -2 in case of an inexact match and furthermore, the input key less
912      *         than the first key of current block(e.g. using a faked index key)
913      */
914     protected int blockSeek(byte[] key, int offset, int length,
915         boolean seekBefore) {
916       int klen, vlen;
917       long memstoreTS = 0;
918       int memstoreTSLen = 0;
919       int lastKeyValueSize = -1;
920       do {
921         blockBuffer.mark();
922         klen = blockBuffer.getInt();
923         vlen = blockBuffer.getInt();
924         blockBuffer.reset();
925         if (this.reader.shouldIncludeMemstoreTS()) {
926           if (this.reader.decodeMemstoreTS) {
927             int memstoreTSOffset = blockBuffer.arrayOffset() + blockBuffer.position()
928                 + KEY_VALUE_LEN_SIZE + klen + vlen;
929             memstoreTS = Bytes.readAsVLong(blockBuffer.array(), memstoreTSOffset);
930             memstoreTSLen = WritableUtils.getVIntSize(memstoreTS);
931           } else {
932             memstoreTS = 0;
933             memstoreTSLen = 1;
934           }
935         }
936 
937         int keyOffset = blockBuffer.arrayOffset() + blockBuffer.position()
938             + KEY_VALUE_LEN_SIZE;
939         int comp = reader.getComparator().compareFlatKey(key, offset, length,
940             blockBuffer.array(), keyOffset, klen);
941 
942         if (comp == 0) {
943           if (seekBefore) {
944             if (lastKeyValueSize < 0) {
945               throw new IllegalStateException("blockSeek with seekBefore "
946                   + "at the first key of the block: key="
947                   + Bytes.toStringBinary(key) + ", blockOffset="
948                   + block.getOffset() + ", onDiskSize="
949                   + block.getOnDiskSizeWithHeader());
950             }
951             blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
952             readKeyValueLen();
953             return 1; // non exact match.
954           }
955           currKeyLen = klen;
956           currValueLen = vlen;
957           if (this.reader.shouldIncludeMemstoreTS()) {
958             currMemstoreTS = memstoreTS;
959             currMemstoreTSLen = memstoreTSLen;
960           }
961           return 0; // indicate exact match
962         } else if (comp < 0) {
963           if (lastKeyValueSize > 0)
964             blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
965           readKeyValueLen();
966           if (lastKeyValueSize == -1 && blockBuffer.position() == 0
967               && this.reader.trailer.getMinorVersion() >= MINOR_VERSION_WITH_FAKED_KEY) {
968             return HConstants.INDEX_KEY_MAGIC;
969           }
970           return 1;
971         }
972 
973         // The size of this key/value tuple, including key/value length fields.
974         lastKeyValueSize = klen + vlen + memstoreTSLen + KEY_VALUE_LEN_SIZE;
975         blockBuffer.position(blockBuffer.position() + lastKeyValueSize);
976       } while (blockBuffer.remaining() > 0);
977 
978       // Seek to the last key we successfully read. This will happen if this is
979       // the last key/value pair in the file, in which case the following call
980       // to next() has to return false.
981       blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
982       readKeyValueLen();
983       return 1; // didn't exactly find it.
984     }
985 
986     @Override
987     protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
988       ByteBuffer buffer = curBlock.getBufferWithoutHeader();
989       // It is safe to manipulate this buffer because we own the buffer object.
990       buffer.rewind();
991       int klen = buffer.getInt();
992       buffer.getInt();
993       ByteBuffer keyBuff = buffer.slice();
994       keyBuff.limit(klen);
995       keyBuff.rewind();
996       return keyBuff;
997     }
998 
999     @Override
1000     public String getKeyString() {
1001       return Bytes.toStringBinary(blockBuffer.array(),
1002           blockBuffer.arrayOffset() + blockBuffer.position()
1003               + KEY_VALUE_LEN_SIZE, currKeyLen);
1004     }
1005 
1006     @Override
1007     public String getValueString() {
1008       return Bytes.toString(blockBuffer.array(), blockBuffer.arrayOffset()
1009           + blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen,
1010           currValueLen);
1011     }
1012   }
1013 
1014   /**
1015    * ScannerV2 that operates on encoded data blocks.
1016    */
1017   protected static class EncodedScannerV2 extends AbstractScannerV2 {
1018     private final HFileBlockDecodingContext decodingCtx;
1019     private final DataBlockEncoder.EncodedSeeker seeker;
1020     private final DataBlockEncoder dataBlockEncoder;
1021     protected final HFileContext meta;
1022 
1023     public EncodedScannerV2(HFileReaderV2 reader, boolean cacheBlocks,
1024         boolean pread, boolean isCompaction, HFileContext meta) {
1025       super(reader, cacheBlocks, pread, isCompaction);
1026       DataBlockEncoding encoding = reader.dataBlockEncoder.getDataBlockEncoding();
1027       dataBlockEncoder = encoding.getEncoder();
1028       decodingCtx = dataBlockEncoder.newDataBlockDecodingContext(meta);
1029       seeker = dataBlockEncoder.createSeeker(
1030         reader.getComparator(), decodingCtx);
1031       this.meta = meta;
1032     }
1033 
1034     @Override
1035     public boolean isSeeked(){
1036       return this.block != null;
1037     }
1038 
1039     /**
1040      * Updates the current block to be the given {@link HFileBlock}. Seeks to
1041      * the the first key/value pair.
1042      *
1043      * @param newBlock the block to make current
1044      * @throws CorruptHFileException
1045      */
1046     private void updateCurrentBlock(HFileBlock newBlock) throws CorruptHFileException {
1047       block = newBlock;
1048 
1049       // sanity checks
1050       if (block.getBlockType() != BlockType.ENCODED_DATA) {
1051         throw new IllegalStateException(
1052             "EncodedScanner works only on encoded data blocks");
1053       }
1054       short dataBlockEncoderId = block.getDataBlockEncodingId();
1055       if (!DataBlockEncoding.isCorrectEncoder(dataBlockEncoder, dataBlockEncoderId)) {
1056         String encoderCls = dataBlockEncoder.getClass().getName();
1057         throw new CorruptHFileException("Encoder " + encoderCls
1058           + " doesn't support data block encoding "
1059           + DataBlockEncoding.getNameFromId(dataBlockEncoderId));
1060       }
1061 
1062       seeker.setCurrentBuffer(getEncodedBuffer(newBlock));
1063       blockFetches++;
1064 
1065       // Reset the next indexed key
1066       this.nextIndexedKey = null;
1067     }
1068 
1069     private ByteBuffer getEncodedBuffer(HFileBlock newBlock) {
1070       ByteBuffer origBlock = newBlock.getBufferReadOnly();
1071       ByteBuffer encodedBlock = ByteBuffer.wrap(origBlock.array(),
1072           origBlock.arrayOffset() + newBlock.headerSize() +
1073           DataBlockEncoding.ID_SIZE,
1074           newBlock.getUncompressedSizeWithoutHeader() -
1075           DataBlockEncoding.ID_SIZE).slice();
1076       return encodedBlock;
1077     }
1078 
1079     @Override
1080     public boolean seekTo() throws IOException {
1081       if (reader == null) {
1082         return false;
1083       }
1084 
1085       if (reader.getTrailer().getEntryCount() == 0) {
1086         // No data blocks.
1087         return false;
1088       }
1089 
1090       long firstDataBlockOffset =
1091           reader.getTrailer().getFirstDataBlockOffset();
1092       if (block != null && block.getOffset() == firstDataBlockOffset) {
1093         seeker.rewind();
1094         return true;
1095       }
1096 
1097       block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
1098           isCompaction, true, BlockType.DATA);
1099       if (block.getOffset() < 0) {
1100         throw new IOException("Invalid block offset: " + block.getOffset());
1101       }
1102       updateCurrentBlock(block);
1103       return true;
1104     }
1105 
1106     @Override
1107     public boolean next() throws IOException {
1108       boolean isValid = seeker.next();
1109       if (!isValid) {
1110         block = readNextDataBlock();
1111         isValid = block != null;
1112         if (isValid) {
1113           updateCurrentBlock(block);
1114         }
1115       }
1116       return isValid;
1117     }
1118 
1119     @Override
1120     public ByteBuffer getKey() {
1121       assertValidSeek();
1122       return seeker.getKeyDeepCopy();
1123     }
1124 
1125     @Override
1126     public int compareKey(KVComparator comparator, byte[] key, int offset, int length) {
1127       return seeker.compareKey(comparator, key, offset, length);
1128     }
1129 
1130     @Override
1131     public ByteBuffer getValue() {
1132       assertValidSeek();
1133       return seeker.getValueShallowCopy();
1134     }
1135 
1136     @Override
1137     public KeyValue getKeyValue() {
1138       if (block == null) {
1139         return null;
1140       }
1141       return seeker.getKeyValue();
1142     }
1143 
1144     @Override
1145     public String getKeyString() {
1146       ByteBuffer keyBuffer = getKey();
1147       return Bytes.toStringBinary(keyBuffer.array(),
1148           keyBuffer.arrayOffset(), keyBuffer.limit());
1149     }
1150 
1151     @Override
1152     public String getValueString() {
1153       ByteBuffer valueBuffer = getValue();
1154       return Bytes.toStringBinary(valueBuffer.array(),
1155           valueBuffer.arrayOffset(), valueBuffer.limit());
1156     }
1157 
1158     private void assertValidSeek() {
1159       if (block == null) {
1160         throw new NotSeekedException();
1161       }
1162     }
1163 
1164     @Override
1165     protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
1166       return dataBlockEncoder.getFirstKeyInBlock(getEncodedBuffer(curBlock));
1167     }
1168 
1169     @Override
1170     protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
1171         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
1172         throws IOException  {
1173       if (block == null || block.getOffset() != seekToBlock.getOffset()) {
1174         updateCurrentBlock(seekToBlock);
1175       } else if (rewind) {
1176         seeker.rewind();
1177       }
1178       this.nextIndexedKey = nextIndexedKey;
1179       return seeker.seekToKeyInBlock(key, offset, length, seekBefore);
1180     }
1181   }
1182 
1183   /**
1184    * Returns a buffer with the Bloom filter metadata. The caller takes
1185    * ownership of the buffer.
1186    */
1187   @Override
1188   public DataInput getGeneralBloomFilterMetadata() throws IOException {
1189     return this.getBloomFilterMetadata(BlockType.GENERAL_BLOOM_META);
1190   }
1191 
1192   @Override
1193   public DataInput getDeleteBloomFilterMetadata() throws IOException {
1194     return this.getBloomFilterMetadata(BlockType.DELETE_FAMILY_BLOOM_META);
1195   }
1196 
1197   private DataInput getBloomFilterMetadata(BlockType blockType)
1198   throws IOException {
1199     if (blockType != BlockType.GENERAL_BLOOM_META &&
1200         blockType != BlockType.DELETE_FAMILY_BLOOM_META) {
1201       throw new RuntimeException("Block Type: " + blockType.toString() +
1202           " is not supported") ;
1203     }
1204 
1205     for (HFileBlock b : loadOnOpenBlocks)
1206       if (b.getBlockType() == blockType)
1207         return b.getByteStream();
1208     return null;
1209   }
1210 
1211   @Override
1212   public boolean isFileInfoLoaded() {
1213     return true; // We load file info in constructor in version 2.
1214   }
1215 
1216   /**
1217    * Validates that the minor version is within acceptable limits.
1218    * Otherwise throws an Runtime exception
1219    */
1220   private void validateMinorVersion(Path path, int minorVersion) {
1221     if (minorVersion < MIN_MINOR_VERSION ||
1222         minorVersion > MAX_MINOR_VERSION) {
1223       String msg = "Minor version for path " + path + 
1224                    " is expected to be between " +
1225                    MIN_MINOR_VERSION + " and " + MAX_MINOR_VERSION +
1226                    " but is found to be " + minorVersion;
1227       LOG.error(msg);
1228       throw new RuntimeException(msg);
1229     }
1230   }
1231 
1232   @Override
1233   public int getMajorVersion() {
1234     return 2;
1235   }
1236 
1237   @Override
1238   public HFileContext getFileContext() {
1239     return hfileContext;
1240   }
1241 
1242   /**
1243    * Returns false if block prefetching was requested for this file and has
1244    * not completed, true otherwise
1245    */
1246   @VisibleForTesting
1247   boolean prefetchComplete() {
1248     return PrefetchExecutor.isCompleted(path);
1249   }
1250 }