View Javadoc

1   /*
2    * Copyright 2011 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.io.hfile;
21  
22  import java.io.DataInput;
23  import java.io.IOException;
24  import java.nio.ByteBuffer;
25  import java.util.ArrayList;
26  import java.util.List;
27  
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  import org.apache.hadoop.fs.FSDataInputStream;
31  import org.apache.hadoop.fs.Path;
32  import org.apache.hadoop.hbase.HConstants;
33  import org.apache.hadoop.hbase.KeyValue;
34  import org.apache.hadoop.hbase.fs.HFileSystem;
35  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoder;
36  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
37  import org.apache.hadoop.hbase.io.hfile.BlockType.BlockCategory;
38  import org.apache.hadoop.hbase.io.hfile.HFile.FileInfo;
39  import org.apache.hadoop.hbase.util.Bytes;
40  import org.apache.hadoop.hbase.util.IdLock;
41  import org.apache.hadoop.io.WritableUtils;
42  
43  /**
44   * {@link HFile} reader for version 2.
45   */
46  public class HFileReaderV2 extends AbstractHFileReader {
47  
48    private static final Log LOG = LogFactory.getLog(HFileReaderV2.class);
49  
50    /**
51     * The size of a (key length, value length) tuple that prefixes each entry in
52     * a data block.
53     */
54    private static int KEY_VALUE_LEN_SIZE = 2 * Bytes.SIZEOF_INT;
55  
56    private boolean includesMemstoreTS = false;
57    private boolean decodeMemstoreTS = false;
58  
59    private boolean shouldIncludeMemstoreTS() {
60      return includesMemstoreTS;
61    }
62  
63    /**
64     * A "sparse lock" implementation allowing to lock on a particular block
65     * identified by offset. The purpose of this is to avoid two clients loading
66     * the same block, and have all but one client wait to get the block from the
67     * cache.
68     */
69    private IdLock offsetLock = new IdLock();
70  
71    /**
72     * Blocks read from the load-on-open section, excluding data root index, meta
73     * index, and file info.
74     */
75    private List<HFileBlock> loadOnOpenBlocks = new ArrayList<HFileBlock>();
76  
77    /** Minimum minor version supported by this HFile format */
78    static final int MIN_MINOR_VERSION = 0;
79  
80    /** Maximum minor version supported by this HFile format */
81    static final int MAX_MINOR_VERSION = 1;
82  
83    /**
84     * Opens a HFile. You must load the index before you can use it by calling
85     * {@link #loadFileInfo()}.
86     *
87     * @param path Path to HFile.
88     * @param trailer File trailer.
89     * @param fsdis input stream. Caller is responsible for closing the passed
90     *          stream.
91     * @param size Length of the stream.
92     * @param closeIStream Whether to close the stream.
93     * @param cacheConf Cache configuration.
94     * @param preferredEncodingInCache the encoding to use in cache in case we
95     *          have a choice. If the file is already encoded on disk, we will
96     *          still use its on-disk encoding in cache.
97     */
98    public HFileReaderV2(Path path, FixedFileTrailer trailer,
99        final FSDataInputStream fsdis, final FSDataInputStream fsdisNoFsChecksum,
100       final long size,
101       final boolean closeIStream, final CacheConfig cacheConf,
102       DataBlockEncoding preferredEncodingInCache, final HFileSystem hfs)
103       throws IOException {
104     super(path, trailer, fsdis, fsdisNoFsChecksum, size, 
105           closeIStream, cacheConf, hfs);
106     trailer.expectMajorVersion(2);
107     validateMinorVersion(path, trailer.getMinorVersion());
108     HFileBlock.FSReaderV2 fsBlockReaderV2 = new HFileBlock.FSReaderV2(fsdis,
109         fsdisNoFsChecksum,
110         compressAlgo, fileSize, trailer.getMinorVersion(), hfs, path);
111     this.fsBlockReader = fsBlockReaderV2; // upcast
112 
113     // Comparator class name is stored in the trailer in version 2.
114     comparator = trailer.createComparator();
115     dataBlockIndexReader = new HFileBlockIndex.BlockIndexReader(comparator,
116         trailer.getNumDataIndexLevels(), this);
117     metaBlockIndexReader = new HFileBlockIndex.BlockIndexReader(
118         Bytes.BYTES_RAWCOMPARATOR, 1);
119 
120     // Parse load-on-open data.
121 
122     HFileBlock.BlockIterator blockIter = fsBlockReaderV2.blockRange(
123         trailer.getLoadOnOpenDataOffset(),
124         fileSize - trailer.getTrailerSize());
125 
126     // Data index. We also read statistics about the block index written after
127     // the root level.
128     dataBlockIndexReader.readMultiLevelIndexRoot(
129         blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
130         trailer.getDataIndexCount());
131 
132     // Meta index.
133     metaBlockIndexReader.readRootIndex(
134         blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
135         trailer.getMetaIndexCount());
136 
137     // File info
138     fileInfo = new FileInfo();
139     fileInfo.readFields(blockIter.nextBlockWithBlockType(BlockType.FILE_INFO).getByteStream());
140     lastKey = fileInfo.get(FileInfo.LASTKEY);
141     avgKeyLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_KEY_LEN));
142     avgValueLen = Bytes.toInt(fileInfo.get(FileInfo.AVG_VALUE_LEN));
143     byte [] keyValueFormatVersion =
144         fileInfo.get(HFileWriterV2.KEY_VALUE_VERSION);
145     includesMemstoreTS = keyValueFormatVersion != null &&
146         Bytes.toInt(keyValueFormatVersion) ==
147             HFileWriterV2.KEY_VALUE_VER_WITH_MEMSTORE;
148     fsBlockReaderV2.setIncludesMemstoreTS(includesMemstoreTS);
149     if (includesMemstoreTS) {
150       decodeMemstoreTS = Bytes.toLong(fileInfo.get(HFileWriterV2.MAX_MEMSTORE_TS_KEY)) > 0;
151     }
152 
153     // Read data block encoding algorithm name from file info.
154     dataBlockEncoder = HFileDataBlockEncoderImpl.createFromFileInfo(fileInfo,
155         preferredEncodingInCache);
156     fsBlockReaderV2.setDataBlockEncoder(dataBlockEncoder);
157 
158     // Store all other load-on-open blocks for further consumption.
159     HFileBlock b;
160     while ((b = blockIter.nextBlock()) != null) {
161       loadOnOpenBlocks.add(b);
162     }
163   }
164 
165   /**
166    * Create a Scanner on this file. No seeks or reads are done on creation. Call
167    * {@link HFileScanner#seekTo(byte[])} to position an start the read. There is
168    * nothing to clean up in a Scanner. Letting go of your references to the
169    * scanner is sufficient.
170    *
171    * @param cacheBlocks True if we should cache blocks read in by this scanner.
172    * @param pread Use positional read rather than seek+read if true (pread is
173    *          better for random reads, seek+read is better scanning).
174    * @param isCompaction is scanner being used for a compaction?
175    * @return Scanner on this file.
176    */
177    @Override
178    public HFileScanner getScanner(boolean cacheBlocks, final boolean pread,
179       final boolean isCompaction) {
180     // check if we want to use data block encoding in memory
181     if (dataBlockEncoder.useEncodedScanner(isCompaction)) {
182       return new EncodedScannerV2(this, cacheBlocks, pread, isCompaction,
183           includesMemstoreTS);
184     }
185 
186     return new ScannerV2(this, cacheBlocks, pread, isCompaction);
187   }
188 
189   /**
190    * @param metaBlockName
191    * @param cacheBlock Add block to cache, if found
192    * @return block wrapped in a ByteBuffer, with header skipped
193    * @throws IOException
194    */
195   @Override
196   public ByteBuffer getMetaBlock(String metaBlockName, boolean cacheBlock)
197       throws IOException {
198     if (trailer.getMetaIndexCount() == 0) {
199       return null; // there are no meta blocks
200     }
201     if (metaBlockIndexReader == null) {
202       throw new IOException("Meta index not loaded");
203     }
204 
205     byte[] mbname = Bytes.toBytes(metaBlockName);
206     int block = metaBlockIndexReader.rootBlockContainingKey(mbname, 0,
207         mbname.length);
208     if (block == -1)
209       return null;
210     long blockSize = metaBlockIndexReader.getRootBlockDataSize(block);
211     long startTimeNs = System.nanoTime();
212 
213     // Per meta key from any given file, synchronize reads for said block. This
214     // is OK to do for meta blocks because the meta block index is always
215     // single-level.
216     synchronized (metaBlockIndexReader.getRootBlockKey(block)) {
217       // Check cache for block. If found return.
218       long metaBlockOffset = metaBlockIndexReader.getRootBlockOffset(block);
219       BlockCacheKey cacheKey = new BlockCacheKey(name, metaBlockOffset,
220           DataBlockEncoding.NONE, BlockType.META);
221 
222       cacheBlock &= cacheConf.shouldCacheDataOnRead();
223       if (cacheConf.isBlockCacheEnabled()) {
224         HFileBlock cachedBlock =
225           (HFileBlock) cacheConf.getBlockCache().getBlock(cacheKey, cacheBlock, false);
226         if (cachedBlock != null) {
227           // Return a distinct 'shallow copy' of the block,
228           // so pos does not get messed by the scanner
229           getSchemaMetrics().updateOnCacheHit(BlockCategory.META, false);
230           return cachedBlock.getBufferWithoutHeader();
231         }
232         // Cache Miss, please load.
233       }
234 
235       HFileBlock metaBlock = fsBlockReader.readBlockData(metaBlockOffset,
236           blockSize, -1, true);
237       passSchemaMetricsTo(metaBlock);
238 
239       final long delta = System.nanoTime() - startTimeNs;
240       HFile.offerReadLatency(delta, true);
241       getSchemaMetrics().updateOnCacheMiss(BlockCategory.META, false, delta);
242 
243       // Cache the block
244       if (cacheBlock) {
245         cacheConf.getBlockCache().cacheBlock(cacheKey, metaBlock,
246             cacheConf.isInMemory());
247       }
248 
249       return metaBlock.getBufferWithoutHeader();
250     }
251   }
252 
253   /**
254    * Read in a file block.
255    * @param dataBlockOffset offset to read.
256    * @param onDiskBlockSize size of the block
257    * @param cacheBlock
258    * @param pread Use positional read instead of seek+read (positional is
259    *          better doing random reads whereas seek+read is better scanning).
260    * @param isCompaction is this block being read as part of a compaction
261    * @param expectedBlockType the block type we are expecting to read with this
262    *          read operation, or null to read whatever block type is available
263    *          and avoid checking (that might reduce caching efficiency of
264    *          encoded data blocks)
265    * @return Block wrapped in a ByteBuffer.
266    * @throws IOException
267    */
268   @Override
269   public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize,
270       final boolean cacheBlock, boolean pread, final boolean isCompaction,
271       BlockType expectedBlockType)
272       throws IOException {
273     if (dataBlockIndexReader == null) {
274       throw new IOException("Block index not loaded");
275     }
276     if (dataBlockOffset < 0
277         || dataBlockOffset >= trailer.getLoadOnOpenDataOffset()) {
278       throw new IOException("Requested block is out of range: "
279           + dataBlockOffset + ", lastDataBlockOffset: "
280           + trailer.getLastDataBlockOffset());
281     }
282     // For any given block from any given file, synchronize reads for said
283     // block.
284     // Without a cache, this synchronizing is needless overhead, but really
285     // the other choice is to duplicate work (which the cache would prevent you
286     // from doing).
287 
288     BlockCacheKey cacheKey =
289         new BlockCacheKey(name, dataBlockOffset,
290             dataBlockEncoder.getEffectiveEncodingInCache(isCompaction),
291             expectedBlockType);
292 
293     boolean useLock = false;
294     IdLock.Entry lockEntry = null;
295 
296     try {
297       while (true) {
298 
299         if (useLock) {
300           lockEntry = offsetLock.getLockEntry(dataBlockOffset);
301         }
302 
303         // Check cache for block. If found return.
304         if (cacheConf.isBlockCacheEnabled()) {
305           // Try and get the block from the block cache.  If the useLock variable is true then this
306           // is the second time through the loop and it should not be counted as a block cache miss.
307           HFileBlock cachedBlock = (HFileBlock)
308               cacheConf.getBlockCache().getBlock(cacheKey, cacheBlock, useLock);
309           if (cachedBlock != null) {
310             BlockCategory blockCategory =
311                 cachedBlock.getBlockType().getCategory();
312 
313             getSchemaMetrics().updateOnCacheHit(blockCategory, isCompaction);
314 
315             if (cachedBlock.getBlockType() == BlockType.DATA) {
316               HFile.dataBlockReadCnt.incrementAndGet();
317             }
318 
319             validateBlockType(cachedBlock, expectedBlockType);
320 
321             // Validate encoding type for encoded blocks. We include encoding
322             // type in the cache key, and we expect it to match on a cache hit.
323             if (cachedBlock.getBlockType() == BlockType.ENCODED_DATA &&
324                 cachedBlock.getDataBlockEncoding() !=
325                     dataBlockEncoder.getEncodingInCache()) {
326               throw new IOException("Cached block under key " + cacheKey + " " +
327                   "has wrong encoding: " + cachedBlock.getDataBlockEncoding() +
328                   " (expected: " + dataBlockEncoder.getEncodingInCache() + ")");
329             }
330             return cachedBlock;
331           }
332           // Carry on, please load.
333         }
334         if (!useLock) {
335           // check cache again with lock
336           useLock = true;
337           continue;
338         }
339 
340         // Load block from filesystem.
341         long startTimeNs = System.nanoTime();
342         HFileBlock hfileBlock = fsBlockReader.readBlockData(dataBlockOffset,
343             onDiskBlockSize, -1, pread);
344         hfileBlock = dataBlockEncoder.diskToCacheFormat(hfileBlock,
345             isCompaction);
346         validateBlockType(hfileBlock, expectedBlockType);
347         passSchemaMetricsTo(hfileBlock);
348         BlockCategory blockCategory = hfileBlock.getBlockType().getCategory();
349 
350         final long delta = System.nanoTime() - startTimeNs;
351         HFile.offerReadLatency(delta, pread);
352         getSchemaMetrics().updateOnCacheMiss(blockCategory, isCompaction, delta);
353 
354         // Cache the block if necessary
355         if (cacheBlock && cacheConf.shouldCacheBlockOnRead(
356             hfileBlock.getBlockType().getCategory())) {
357           cacheConf.getBlockCache().cacheBlock(cacheKey, hfileBlock,
358               cacheConf.isInMemory());
359         }
360 
361         if (hfileBlock.getBlockType() == BlockType.DATA) {
362           HFile.dataBlockReadCnt.incrementAndGet();
363         }
364 
365         return hfileBlock;
366       }
367     } finally {
368       if (lockEntry != null) {
369         offsetLock.releaseLockEntry(lockEntry);
370       }
371     }
372   }
373 
374   /**
375    * Compares the actual type of a block retrieved from cache or disk with its
376    * expected type and throws an exception in case of a mismatch. Expected
377    * block type of {@link BlockType#DATA} is considered to match the actual
378    * block type [@link {@link BlockType#ENCODED_DATA} as well.
379    * @param block a block retrieved from cache or disk
380    * @param expectedBlockType the expected block type, or null to skip the
381    *          check
382    */
383   private void validateBlockType(HFileBlock block,
384       BlockType expectedBlockType) throws IOException {
385     if (expectedBlockType == null) {
386       return;
387     }
388     BlockType actualBlockType = block.getBlockType();
389     if (actualBlockType == BlockType.ENCODED_DATA &&
390         expectedBlockType == BlockType.DATA) {
391       // We consider DATA to match ENCODED_DATA for the purpose of this
392       // verification.
393       return;
394     }
395     if (actualBlockType != expectedBlockType) {
396       throw new IOException("Expected block type " + expectedBlockType + ", " +
397           "but got " + actualBlockType + ": " + block);
398     }
399   }
400 
401   /**
402    * @return Last key in the file. May be null if file has no entries. Note that
403    *         this is not the last row key, but rather the byte form of the last
404    *         KeyValue.
405    */
406   @Override
407   public byte[] getLastKey() {
408     return dataBlockIndexReader.isEmpty() ? null : lastKey;
409   }
410 
411   /**
412    * @return Midkey for this file. We work with block boundaries only so
413    *         returned midkey is an approximation only.
414    * @throws IOException
415    */
416   @Override
417   public byte[] midkey() throws IOException {
418     return dataBlockIndexReader.midkey();
419   }
420 
421   @Override
422   public void close() throws IOException {
423     close(cacheConf.shouldEvictOnClose());
424   }
425 
426   public void close(boolean evictOnClose) throws IOException {
427     if (evictOnClose && cacheConf.isBlockCacheEnabled()) {
428       int numEvicted = cacheConf.getBlockCache().evictBlocksByHfileName(name);
429       if (LOG.isTraceEnabled()) {
430         LOG.trace("On close, file=" + name + " evicted=" + numEvicted
431           + " block(s)");
432       }
433     }
434     if (closeIStream) {
435       if (istream != istreamNoFsChecksum && istreamNoFsChecksum != null) {
436         istreamNoFsChecksum.close();
437         istreamNoFsChecksum = null;
438       }
439       if (istream != null) {
440         istream.close();
441         istream = null;
442       }
443     }
444 
445     getSchemaMetrics().flushMetrics();
446   }
447 
448   protected abstract static class AbstractScannerV2
449       extends AbstractHFileReader.Scanner {
450     protected HFileBlock block;
451 
452     /**
453      * The next indexed key is to keep track of the indexed key of the next data block.
454      * If the nextIndexedKey is HConstants.NO_NEXT_INDEXED_KEY, it means that the
455      * current data block is the last data block.
456      *
457      * If the nextIndexedKey is null, it means the nextIndexedKey has not been loaded yet.
458      */
459     protected byte[] nextIndexedKey;
460 
461     public AbstractScannerV2(HFileReaderV2 r, boolean cacheBlocks,
462         final boolean pread, final boolean isCompaction) {
463       super(r, cacheBlocks, pread, isCompaction);
464     }
465 
466     /**
467      * An internal API function. Seek to the given key, optionally rewinding to
468      * the first key of the block before doing the seek.
469      *
470      * @param key key byte array
471      * @param offset key offset in the key byte array
472      * @param length key length
473      * @param rewind whether to rewind to the first key of the block before
474      *        doing the seek. If this is false, we are assuming we never go
475      *        back, otherwise the result is undefined.
476      * @return -1 if the key is earlier than the first key of the file,
477      *         0 if we are at the given key, and 1 if we are past the given key
478      * @throws IOException
479      */
480     protected int seekTo(byte[] key, int offset, int length, boolean rewind)
481         throws IOException {
482       HFileBlockIndex.BlockIndexReader indexReader =
483           reader.getDataBlockIndexReader();
484       BlockWithScanInfo blockWithScanInfo =
485         indexReader.loadDataBlockWithScanInfo(key, offset, length, block,
486             cacheBlocks, pread, isCompaction);
487       if (blockWithScanInfo == null || blockWithScanInfo.getHFileBlock() == null) {
488         // This happens if the key e.g. falls before the beginning of the file.
489         return -1;
490       }
491       return loadBlockAndSeekToKey(blockWithScanInfo.getHFileBlock(),
492           blockWithScanInfo.getNextIndexedKey(), rewind, key, offset, length, false);
493     }
494 
495     protected abstract ByteBuffer getFirstKeyInBlock(HFileBlock curBlock);
496 
497     protected abstract int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
498         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
499         throws IOException;
500 
501     @Override
502     public int seekTo(byte[] key, int offset, int length) throws IOException {
503       // Always rewind to the first key of the block, because the given key
504       // might be before or after the current key.
505       return seekTo(key, offset, length, true);
506     }
507 
508     @Override
509     public int reseekTo(byte[] key, int offset, int length) throws IOException {
510       int compared;
511       if (isSeeked()) {
512         ByteBuffer bb = getKey();
513         compared = reader.getComparator().compare(key, offset,
514             length, bb.array(), bb.arrayOffset(), bb.limit());
515         if (compared < 1) {
516           // If the required key is less than or equal to current key, then
517           // don't do anything.
518           return compared;
519         } else {
520           if (this.nextIndexedKey != null &&
521               (this.nextIndexedKey == HConstants.NO_NEXT_INDEXED_KEY ||
522                reader.getComparator().compare(key, offset, length,
523                    nextIndexedKey, 0, nextIndexedKey.length) < 0)) {
524             // The reader shall continue to scan the current data block instead of querying the
525             // block index as long as it knows the target key is strictly smaller than
526             // the next indexed key or the current data block is the last data block.
527             return loadBlockAndSeekToKey(this.block, this.nextIndexedKey,
528                 false, key, offset, length, false);
529           }
530         }
531       }
532       // Don't rewind on a reseek operation, because reseek implies that we are
533       // always going forward in the file.
534       return seekTo(key, offset, length, false);
535     }
536 
537     @Override
538     public boolean seekBefore(byte[] key, int offset, int length)
539         throws IOException {
540       HFileBlock seekToBlock =
541           reader.getDataBlockIndexReader().seekToDataBlock(key, offset, length,
542               block, cacheBlocks, pread, isCompaction);
543       if (seekToBlock == null) {
544         return false;
545       }
546       ByteBuffer firstKey = getFirstKeyInBlock(seekToBlock);
547 
548       if (reader.getComparator().compare(firstKey.array(),
549           firstKey.arrayOffset(), firstKey.limit(), key, offset, length) == 0)
550       {
551         long previousBlockOffset = seekToBlock.getPrevBlockOffset();
552         // The key we are interested in
553         if (previousBlockOffset == -1) {
554           // we have a 'problem', the key we want is the first of the file.
555           return false;
556         }
557 
558         // It is important that we compute and pass onDiskSize to the block
559         // reader so that it does not have to read the header separately to
560         // figure out the size.
561         seekToBlock = reader.readBlock(previousBlockOffset,
562             seekToBlock.getOffset() - previousBlockOffset, cacheBlocks,
563             pread, isCompaction, BlockType.DATA);
564         // TODO shortcut: seek forward in this block to the last key of the
565         // block.
566       }
567       byte[] firstKeyInCurrentBlock = Bytes.getBytes(firstKey);
568       loadBlockAndSeekToKey(seekToBlock, firstKeyInCurrentBlock, true, key, offset, length, true);
569       return true;
570     }
571 
572 
573     /**
574      * Scans blocks in the "scanned" section of the {@link HFile} until the next
575      * data block is found.
576      *
577      * @return the next block, or null if there are no more data blocks
578      * @throws IOException
579      */
580     protected HFileBlock readNextDataBlock() throws IOException {
581       long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
582       if (block == null)
583         return null;
584 
585       HFileBlock curBlock = block;
586 
587       do {
588         if (curBlock.getOffset() >= lastDataBlockOffset)
589           return null;
590 
591         if (curBlock.getOffset() < 0) {
592           throw new IOException("Invalid block file offset: " + block);
593         }
594 
595         // We are reading the next block without block type validation, because
596         // it might turn out to be a non-data block.
597         curBlock = reader.readBlock(curBlock.getOffset()
598             + curBlock.getOnDiskSizeWithHeader(),
599             curBlock.getNextBlockOnDiskSizeWithHeader(), cacheBlocks, pread,
600             isCompaction, null);
601       } while (!(curBlock.getBlockType().equals(BlockType.DATA) ||
602           curBlock.getBlockType().equals(BlockType.ENCODED_DATA)));
603 
604       return curBlock;
605     }
606   }
607 
608   /**
609    * Implementation of {@link HFileScanner} interface.
610    */
611   protected static class ScannerV2 extends AbstractScannerV2 {
612     private HFileReaderV2 reader;
613 
614     public ScannerV2(HFileReaderV2 r, boolean cacheBlocks,
615         final boolean pread, final boolean isCompaction) {
616       super(r, cacheBlocks, pread, isCompaction);
617       this.reader = r;
618     }
619 
620     @Override
621     public KeyValue getKeyValue() {
622       if (!isSeeked())
623         return null;
624 
625       KeyValue ret = new KeyValue(blockBuffer.array(),
626           blockBuffer.arrayOffset() + blockBuffer.position(),
627           KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen,
628           currKeyLen);
629       if (this.reader.shouldIncludeMemstoreTS()) {
630         ret.setMemstoreTS(currMemstoreTS);
631       }
632       return ret;
633     }
634 
635     @Override
636     public ByteBuffer getKey() {
637       assertSeeked();
638       return ByteBuffer.wrap(
639           blockBuffer.array(),
640           blockBuffer.arrayOffset() + blockBuffer.position()
641               + KEY_VALUE_LEN_SIZE, currKeyLen).slice();
642     }
643 
644     @Override
645     public ByteBuffer getValue() {
646       assertSeeked();
647       return ByteBuffer.wrap(
648           blockBuffer.array(),
649           blockBuffer.arrayOffset() + blockBuffer.position()
650               + KEY_VALUE_LEN_SIZE + currKeyLen, currValueLen).slice();
651     }
652 
653     private void setNonSeekedState() {
654       block = null;
655       blockBuffer = null;
656       currKeyLen = 0;
657       currValueLen = 0;
658       currMemstoreTS = 0;
659       currMemstoreTSLen = 0;
660     }
661 
662     /**
663      * Go to the next key/value in the block section. Loads the next block if
664      * necessary. If successful, {@link #getKey()} and {@link #getValue()} can
665      * be called.
666      *
667      * @return true if successfully navigated to the next key/value
668      */
669     @Override
670     public boolean next() throws IOException {
671       assertSeeked();
672 
673       try {
674         blockBuffer.position(blockBuffer.position() + KEY_VALUE_LEN_SIZE
675             + currKeyLen + currValueLen + currMemstoreTSLen);
676       } catch (IllegalArgumentException e) {
677         LOG.error("Current pos = " + blockBuffer.position()
678             + "; currKeyLen = " + currKeyLen + "; currValLen = "
679             + currValueLen + "; block limit = " + blockBuffer.limit()
680             + "; HFile name = " + reader.getName()
681             + "; currBlock currBlockOffset = " + block.getOffset());
682         throw e;
683       }
684 
685       if (blockBuffer.remaining() <= 0) {
686         long lastDataBlockOffset =
687             reader.getTrailer().getLastDataBlockOffset();
688 
689         if (block.getOffset() >= lastDataBlockOffset) {
690           setNonSeekedState();
691           return false;
692         }
693 
694         // read the next block
695         HFileBlock nextBlock = readNextDataBlock();
696         if (nextBlock == null) {
697           setNonSeekedState();
698           return false;
699         }
700 
701         updateCurrBlock(nextBlock);
702         return true;
703       }
704 
705       // We are still in the same block.
706       readKeyValueLen();
707       return true;
708     }
709 
710     /**
711      * Positions this scanner at the start of the file.
712      *
713      * @return false if empty file; i.e. a call to next would return false and
714      *         the current key and value are undefined.
715      * @throws IOException
716      */
717     @Override
718     public boolean seekTo() throws IOException {
719       if (reader == null) {
720         return false;
721       }
722 
723       if (reader.getTrailer().getEntryCount() == 0) {
724         // No data blocks.
725         return false;
726       }
727 
728       long firstDataBlockOffset =
729           reader.getTrailer().getFirstDataBlockOffset();
730       if (block != null && block.getOffset() == firstDataBlockOffset) {
731         blockBuffer.rewind();
732         readKeyValueLen();
733         return true;
734       }
735 
736       block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
737           isCompaction, BlockType.DATA);
738       if (block.getOffset() < 0) {
739         throw new IOException("Invalid block offset: " + block.getOffset());
740       }
741       updateCurrBlock(block);
742       return true;
743     }
744 
745     @Override
746     protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
747         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
748         throws IOException {
749       if (block == null || block.getOffset() != seekToBlock.getOffset()) {
750         updateCurrBlock(seekToBlock);
751       } else if (rewind) {
752         blockBuffer.rewind();
753       }
754 
755       // Update the nextIndexedKey
756       this.nextIndexedKey = nextIndexedKey;
757       return blockSeek(key, offset, length, seekBefore);
758     }
759 
760     /**
761      * Updates the current block to be the given {@link HFileBlock}. Seeks to
762      * the the first key/value pair.
763      *
764      * @param newBlock the block to make current
765      */
766     private void updateCurrBlock(HFileBlock newBlock) {
767       block = newBlock;
768 
769       // sanity check
770       if (block.getBlockType() != BlockType.DATA) {
771         throw new IllegalStateException("ScannerV2 works only on data " +
772             "blocks, got " + block.getBlockType() + "; " +
773             "fileName=" + reader.name + ", " +
774             "dataBlockEncoder=" + reader.dataBlockEncoder + ", " +
775             "isCompaction=" + isCompaction);
776       }
777 
778       blockBuffer = block.getBufferWithoutHeader();
779       readKeyValueLen();
780       blockFetches++;
781 
782       // Reset the next indexed key
783       this.nextIndexedKey = null;
784     }
785 
786     private final void readKeyValueLen() {
787       blockBuffer.mark();
788       currKeyLen = blockBuffer.getInt();
789       currValueLen = blockBuffer.getInt();
790       blockBuffer.reset();
791       if (this.reader.shouldIncludeMemstoreTS()) {
792         if (this.reader.decodeMemstoreTS) {
793           try {
794             int memstoreTSOffset = blockBuffer.arrayOffset()
795                 + blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen
796                 + currValueLen;
797             currMemstoreTS = Bytes.readVLong(blockBuffer.array(),
798                 memstoreTSOffset);
799             currMemstoreTSLen = WritableUtils.getVIntSize(currMemstoreTS);
800           } catch (Exception e) {
801             throw new RuntimeException("Error reading memstore timestamp", e);
802           }
803         } else {
804           currMemstoreTS = 0;
805           currMemstoreTSLen = 1;
806         }
807       }
808 
809       if (currKeyLen < 0 || currValueLen < 0
810           || currKeyLen > blockBuffer.limit()
811           || currValueLen > blockBuffer.limit()) {
812         throw new IllegalStateException("Invalid currKeyLen " + currKeyLen
813             + " or currValueLen " + currValueLen + ". Block offset: "
814             + block.getOffset() + ", block length: " + blockBuffer.limit()
815             + ", position: " + blockBuffer.position() + " (without header).");
816       }
817     }
818 
819     /**
820      * Within a loaded block, seek looking for the last key that is smaller
821      * than (or equal to?) the key we are interested in.
822      *
823      * A note on the seekBefore: if you have seekBefore = true, AND the first
824      * key in the block = key, then you'll get thrown exceptions. The caller has
825      * to check for that case and load the previous block as appropriate.
826      *
827      * @param key the key to find
828      * @param seekBefore find the key before the given key in case of exact
829      *          match.
830      * @return 0 in case of an exact key match, 1 in case of an inexact match
831      */
832     private int blockSeek(byte[] key, int offset, int length,
833         boolean seekBefore) {
834       int klen, vlen;
835       long memstoreTS = 0;
836       int memstoreTSLen = 0;
837       int lastKeyValueSize = -1;
838       do {
839         blockBuffer.mark();
840         klen = blockBuffer.getInt();
841         vlen = blockBuffer.getInt();
842         blockBuffer.reset();
843         if (this.reader.shouldIncludeMemstoreTS()) {
844           if (this.reader.decodeMemstoreTS) {
845             try {
846               int memstoreTSOffset = blockBuffer.arrayOffset()
847                   + blockBuffer.position() + KEY_VALUE_LEN_SIZE + klen + vlen;
848               memstoreTS = Bytes.readVLong(blockBuffer.array(),
849                   memstoreTSOffset);
850               memstoreTSLen = WritableUtils.getVIntSize(memstoreTS);
851             } catch (Exception e) {
852               throw new RuntimeException("Error reading memstore timestamp", e);
853             }
854           } else {
855             memstoreTS = 0;
856             memstoreTSLen = 1;
857           }
858         }
859 
860         int keyOffset = blockBuffer.arrayOffset() + blockBuffer.position()
861             + KEY_VALUE_LEN_SIZE;
862         int comp = reader.getComparator().compare(key, offset, length,
863             blockBuffer.array(), keyOffset, klen);
864 
865         if (comp == 0) {
866           if (seekBefore) {
867             if (lastKeyValueSize < 0) {
868               throw new IllegalStateException("blockSeek with seekBefore "
869                   + "at the first key of the block: key="
870                   + Bytes.toStringBinary(key) + ", blockOffset="
871                   + block.getOffset() + ", onDiskSize="
872                   + block.getOnDiskSizeWithHeader());
873             }
874             blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
875             readKeyValueLen();
876             return 1; // non exact match.
877           }
878           currKeyLen = klen;
879           currValueLen = vlen;
880           if (this.reader.shouldIncludeMemstoreTS()) {
881             currMemstoreTS = memstoreTS;
882             currMemstoreTSLen = memstoreTSLen;
883           }
884           return 0; // indicate exact match
885         }
886 
887         if (comp < 0) {
888           if (lastKeyValueSize > 0)
889             blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
890           readKeyValueLen();
891           return 1;
892         }
893 
894         // The size of this key/value tuple, including key/value length fields.
895         lastKeyValueSize = klen + vlen + memstoreTSLen + KEY_VALUE_LEN_SIZE;
896         blockBuffer.position(blockBuffer.position() + lastKeyValueSize);
897       } while (blockBuffer.remaining() > 0);
898 
899       // Seek to the last key we successfully read. This will happen if this is
900       // the last key/value pair in the file, in which case the following call
901       // to next() has to return false.
902       blockBuffer.position(blockBuffer.position() - lastKeyValueSize);
903       readKeyValueLen();
904       return 1; // didn't exactly find it.
905     }
906 
907     @Override
908     protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
909       ByteBuffer buffer = curBlock.getBufferWithoutHeader();
910       // It is safe to manipulate this buffer because we own the buffer object.
911       buffer.rewind();
912       int klen = buffer.getInt();
913       buffer.getInt();
914       ByteBuffer keyBuff = buffer.slice();
915       keyBuff.limit(klen);
916       keyBuff.rewind();
917       return keyBuff;
918     }
919 
920     @Override
921     public String getKeyString() {
922       return Bytes.toStringBinary(blockBuffer.array(),
923           blockBuffer.arrayOffset() + blockBuffer.position()
924               + KEY_VALUE_LEN_SIZE, currKeyLen);
925     }
926 
927     @Override
928     public String getValueString() {
929       return Bytes.toString(blockBuffer.array(), blockBuffer.arrayOffset()
930           + blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen,
931           currValueLen);
932     }
933   }
934 
935   /**
936    * ScannerV2 that operates on encoded data blocks.
937    */
938   protected static class EncodedScannerV2 extends AbstractScannerV2 {
939     private DataBlockEncoder.EncodedSeeker seeker = null;
940     private DataBlockEncoder dataBlockEncoder = null;
941     private final boolean includesMemstoreTS;
942 
943     public EncodedScannerV2(HFileReaderV2 reader, boolean cacheBlocks,
944         boolean pread, boolean isCompaction, boolean includesMemstoreTS) {
945       super(reader, cacheBlocks, pread, isCompaction);
946       this.includesMemstoreTS = includesMemstoreTS;
947     }
948 
949     private void setDataBlockEncoder(DataBlockEncoder dataBlockEncoder) {
950       this.dataBlockEncoder = dataBlockEncoder;
951       seeker = dataBlockEncoder.createSeeker(reader.getComparator(),
952           includesMemstoreTS);
953     }
954 
955     /**
956      * Updates the current block to be the given {@link HFileBlock}. Seeks to
957      * the the first key/value pair.
958      *
959      * @param newBlock the block to make current
960      */
961     private void updateCurrentBlock(HFileBlock newBlock) {
962       block = newBlock;
963 
964       // sanity checks
965       if (block.getBlockType() != BlockType.ENCODED_DATA) {
966         throw new IllegalStateException(
967             "EncodedScannerV2 works only on encoded data blocks");
968       }
969 
970       short dataBlockEncoderId = block.getDataBlockEncodingId();
971       if (dataBlockEncoder == null ||
972           !DataBlockEncoding.isCorrectEncoder(dataBlockEncoder,
973               dataBlockEncoderId)) {
974         DataBlockEncoder encoder =
975             DataBlockEncoding.getDataBlockEncoderById(dataBlockEncoderId);
976         setDataBlockEncoder(encoder);
977       }
978 
979       seeker.setCurrentBuffer(getEncodedBuffer(newBlock));
980       blockFetches++;
981     }
982 
983     private ByteBuffer getEncodedBuffer(HFileBlock newBlock) {
984       ByteBuffer origBlock = newBlock.getBufferReadOnly();
985       ByteBuffer encodedBlock = ByteBuffer.wrap(origBlock.array(),
986           origBlock.arrayOffset() + newBlock.headerSize() +
987           DataBlockEncoding.ID_SIZE,
988           newBlock.getUncompressedSizeWithoutHeader() -
989           DataBlockEncoding.ID_SIZE).slice();
990       return encodedBlock;
991     }
992 
993     @Override
994     public boolean seekTo() throws IOException {
995       if (reader == null) {
996         return false;
997       }
998 
999       if (reader.getTrailer().getEntryCount() == 0) {
1000         // No data blocks.
1001         return false;
1002       }
1003 
1004       long firstDataBlockOffset =
1005           reader.getTrailer().getFirstDataBlockOffset();
1006       if (block != null && block.getOffset() == firstDataBlockOffset) {
1007         seeker.rewind();
1008         return true;
1009       }
1010 
1011       block = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
1012           isCompaction, BlockType.DATA);
1013       if (block.getOffset() < 0) {
1014         throw new IOException("Invalid block offset: " + block.getOffset());
1015       }
1016       updateCurrentBlock(block);
1017       return true;
1018     }
1019 
1020     @Override
1021     public boolean next() throws IOException {
1022       boolean isValid = seeker.next();
1023       if (!isValid) {
1024         block = readNextDataBlock();
1025         isValid = block != null;
1026         if (isValid) {
1027           updateCurrentBlock(block);
1028         }
1029       }
1030       return isValid;
1031     }
1032 
1033     @Override
1034     public ByteBuffer getKey() {
1035       assertValidSeek();
1036       return seeker.getKeyDeepCopy();
1037     }
1038 
1039     @Override
1040     public ByteBuffer getValue() {
1041       assertValidSeek();
1042       return seeker.getValueShallowCopy();
1043     }
1044 
1045     @Override
1046     public KeyValue getKeyValue() {
1047       if (block == null) {
1048         return null;
1049       }
1050       return seeker.getKeyValue();
1051     }
1052 
1053     @Override
1054     public String getKeyString() {
1055       ByteBuffer keyBuffer = getKey();
1056       return Bytes.toStringBinary(keyBuffer.array(),
1057           keyBuffer.arrayOffset(), keyBuffer.limit());
1058     }
1059 
1060     @Override
1061     public String getValueString() {
1062       ByteBuffer valueBuffer = getValue();
1063       return Bytes.toStringBinary(valueBuffer.array(),
1064           valueBuffer.arrayOffset(), valueBuffer.limit());
1065     }
1066 
1067     private void assertValidSeek() {
1068       if (block == null) {
1069         throw new NotSeekedException();
1070       }
1071     }
1072 
1073     @Override
1074     protected ByteBuffer getFirstKeyInBlock(HFileBlock curBlock) {
1075       return dataBlockEncoder.getFirstKeyInBlock(getEncodedBuffer(curBlock));
1076     }
1077 
1078     @Override
1079     protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, byte[] nextIndexedKey,
1080         boolean rewind, byte[] key, int offset, int length, boolean seekBefore)
1081         throws IOException  {
1082       if (block == null || block.getOffset() != seekToBlock.getOffset()) {
1083         updateCurrentBlock(seekToBlock);
1084       } else if (rewind) {
1085         seeker.rewind();
1086       }
1087       this.nextIndexedKey = nextIndexedKey;
1088       return seeker.seekToKeyInBlock(key, offset, length, seekBefore);
1089     }
1090   }
1091 
1092   /**
1093    * Returns a buffer with the Bloom filter metadata. The caller takes
1094    * ownership of the buffer.
1095    */
1096   @Override
1097   public DataInput getGeneralBloomFilterMetadata() throws IOException {
1098     return this.getBloomFilterMetadata(BlockType.GENERAL_BLOOM_META);
1099   }
1100 
1101   @Override
1102   public DataInput getDeleteBloomFilterMetadata() throws IOException {
1103     return this.getBloomFilterMetadata(BlockType.DELETE_FAMILY_BLOOM_META);
1104   }
1105 
1106   private DataInput getBloomFilterMetadata(BlockType blockType)
1107   throws IOException {
1108     if (blockType != BlockType.GENERAL_BLOOM_META &&
1109         blockType != BlockType.DELETE_FAMILY_BLOOM_META) {
1110       throw new RuntimeException("Block Type: " + blockType.toString() +
1111           " is not supported") ;
1112     }
1113 
1114     for (HFileBlock b : loadOnOpenBlocks)
1115       if (b.getBlockType() == blockType)
1116         return b.getByteStream();
1117     return null;
1118   }
1119 
1120   @Override
1121   public boolean isFileInfoLoaded() {
1122     return true; // We load file info in constructor in version 2.
1123   }
1124 
1125   /**
1126    * Validates that the minor version is within acceptable limits.
1127    * Otherwise throws an Runtime exception
1128    */
1129   private void validateMinorVersion(Path path, int minorVersion) {
1130     if (minorVersion < MIN_MINOR_VERSION ||
1131         minorVersion > MAX_MINOR_VERSION) {
1132       String msg = "Minor version for path " + path + 
1133                    " is expected to be between " +
1134                    MIN_MINOR_VERSION + " and " + MAX_MINOR_VERSION +
1135                    " but is found to be " + minorVersion;
1136       LOG.error(msg);
1137       throw new RuntimeException(msg);
1138     }
1139   }
1140 }