1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.io.hfile;
19  
20  import java.io.ByteArrayInputStream;
21  import java.io.ByteArrayOutputStream;
22  import java.io.DataInputStream;
23  import java.io.DataOutput;
24  import java.io.DataOutputStream;
25  import java.io.IOException;
26  import java.io.InputStream;
27  import java.nio.ByteBuffer;
28  import java.util.concurrent.locks.Lock;
29  import java.util.concurrent.locks.ReentrantLock;
30  
31  import org.apache.hadoop.classification.InterfaceAudience;
32  import org.apache.hadoop.fs.FSDataInputStream;
33  import org.apache.hadoop.fs.FSDataOutputStream;
34  import org.apache.hadoop.fs.Path;
35  import org.apache.hadoop.hbase.HConstants;
36  import org.apache.hadoop.hbase.fs.HFileSystem;
37  import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper;
38  import org.apache.hadoop.hbase.io.compress.Compression;
39  import org.apache.hadoop.hbase.io.compress.Compression.Algorithm;
40  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
41  import org.apache.hadoop.hbase.io.encoding.HFileBlockDecodingContext;
42  import org.apache.hadoop.hbase.io.encoding.HFileBlockDefaultDecodingContext;
43  import org.apache.hadoop.hbase.io.encoding.HFileBlockDefaultEncodingContext;
44  import org.apache.hadoop.hbase.io.encoding.HFileBlockEncodingContext;
45  import org.apache.hadoop.hbase.io.hfile.bucket.BucketCache;
46  import org.apache.hadoop.hbase.util.Bytes;
47  import org.apache.hadoop.hbase.util.ChecksumType;
48  import org.apache.hadoop.hbase.util.ClassSize;
49  import org.apache.hadoop.hbase.util.CompoundBloomFilter;
50  import org.apache.hadoop.io.IOUtils;
51  
52  import com.google.common.base.Preconditions;
53  
54  /**
55   * Reading {@link HFile} version 1 and 2 blocks, and writing version 2 blocks.
56   * <ul>
57   * <li>In version 1 all blocks are either compressed or uncompressed, as
58   * specified by the {@link HFile}'s compression algorithm, with a type-specific
59   * magic record stored in the beginning of the compressed data (i.e. one needs
60   * to uncompress the compressed block to determine the block type). There is
61   * only a single compression algorithm setting for all blocks. Offset and size
62   * information from the block index are required to read a block.
63   * <li>In version 2 a block is structured as follows:
64   * <ul>
65   * <li>Magic record identifying the block type (8 bytes)
66   * <li>Compressed block size, header not included (4 bytes)
67   * <li>Uncompressed block size, header not included (4 bytes)
68   * <li>The offset of the previous block of the same type (8 bytes). This is
69   * used to navigate to the previous block without going to the block index.
70   * <li>For minorVersions >= 1, there is an additional 4 byte field
71   * bytesPerChecksum that records the number of bytes in a checksum chunk.
72   * <li>For minorVersions >= 1, there is a 4 byte value that stores the size of
73   * the data on disk (excluding the checksums).
74   * <li>For minorVersions >= 1, a series of 4 byte checksums, one for each
75   * bytesPerChecksum bytes of data.
76   *
77   * <li>Compressed data (or uncompressed data if compression is disabled). The
78   * compression algorithm is the same for all the blocks in the {@link HFile},
79   * similarly to what was done in version 1.
80   * </ul>
81   * </ul>
82   * The version 2 block representation in the block cache is the same as above,
83   * except that the data section is always uncompressed in the cache.
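 * <p>An illustrative sketch of how the fixed version 2 header fields map onto
 * buffer reads; it mirrors the {@code HFileBlock(ByteBuffer, int)} constructor
 * below, and {@code blockBytes} is an assumed byte array holding a block whose
 * header has minorVersion >= 1:
 * <pre>{@code
 * ByteBuffer b = ByteBuffer.wrap(blockBytes);        // header comes first
 * byte[] magic = new byte[8];
 * b.get(magic);                                       // block type magic record
 * int onDiskSizeWithoutHeader = b.getInt();
 * int uncompressedSizeWithoutHeader = b.getInt();
 * long prevBlockOffset = b.getLong();
 * byte checksumType = b.get();                        // minorVersion >= 1 only
 * int bytesPerChecksum = b.getInt();                  // minorVersion >= 1 only
 * int onDiskDataSizeWithHeader = b.getInt();          // minorVersion >= 1 only
 * }</pre>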
84   */
85  @InterfaceAudience.Private
86  public class HFileBlock implements Cacheable {
87  
88    /** Minor versions starting with this number have hbase checksums */
89    static final int MINOR_VERSION_WITH_CHECKSUM = 1;
90  
91    /** minor version that does not support checksums */
92    static final int MINOR_VERSION_NO_CHECKSUM = 0;
93  
94    /**
95     * On a checksum failure on a Reader, this many succeeding read
96     * requests switch back to using HDFS checksums before automatically
97     * re-enabling HBase checksum verification.
98     */
99    static final int CHECKSUM_VERIFICATION_NUM_IO_THRESHOLD = 3;
100 
101   public static final boolean FILL_HEADER = true;
102   public static final boolean DONT_FILL_HEADER = false;
103 
104   /**
105    * The size of block header when blockType is {@link BlockType#ENCODED_DATA}.
106    * This extends normal header by adding the id of encoder.
107    */
108   public static final int ENCODED_HEADER_SIZE = HConstants.HFILEBLOCK_HEADER_SIZE
109       + DataBlockEncoding.ID_SIZE;
110 
111   static final byte[] DUMMY_HEADER_NO_CHECKSUM =
112      new byte[HConstants.HFILEBLOCK_HEADER_SIZE_NO_CHECKSUM];
113 
114   public static final int BYTE_BUFFER_HEAP_SIZE = (int) ClassSize.estimateBase(
115       ByteBuffer.wrap(new byte[0], 0, 0).getClass(), false);
116 
117   // Extra bytes serialized with each cached block: minorVersion + offset + nextBlockOnDiskSizeWithHeader
118   public static final int EXTRA_SERIALIZATION_SPACE = 2 * Bytes.SIZEOF_INT
119       + Bytes.SIZEOF_LONG;
120 
121   /**
122    * Each checksum value is an integer that can be stored in 4 bytes.
123    */
124   static final int CHECKSUM_SIZE = Bytes.SIZEOF_INT;
125 
126   private static final CacheableDeserializer<Cacheable> blockDeserializer =
127       new CacheableDeserializer<Cacheable>() {
128         public HFileBlock deserialize(ByteBuffer buf, boolean reuse) throws IOException{
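          // Serialized cache entries consist of the block bytes followed by
          // EXTRA_SERIALIZATION_SPACE trailing bytes (minorVersion, offset,
          // nextBlockOnDiskSizeWithHeader). Hide that trailer first so only the
          // block bytes are sliced or copied below.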
129           buf.limit(buf.limit() - HFileBlock.EXTRA_SERIALIZATION_SPACE).rewind();
130           ByteBuffer newByteBuffer;
131           if (reuse) {
132             newByteBuffer = buf.slice();
133           } else {
134             newByteBuffer = ByteBuffer.allocate(buf.limit());
135             newByteBuffer.put(buf);
136           }
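          // Move past the block bytes and restore the full limit so the
          // trailing serialization fields can be read.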
137           buf.position(buf.limit());
138           buf.limit(buf.limit() + HFileBlock.EXTRA_SERIALIZATION_SPACE);
139           int minorVersion=buf.getInt();
140           HFileBlock ourBuffer = new HFileBlock(newByteBuffer, minorVersion);
141           ourBuffer.offset = buf.getLong();
142           ourBuffer.nextBlockOnDiskSizeWithHeader = buf.getInt();
143           return ourBuffer;
144         }
145         
146         @Override
147         public int getDeserialiserIdentifier() {
148           return deserializerIdentifier;
149         }
150 
151         @Override
152         public HFileBlock deserialize(ByteBuffer b) throws IOException {
153           return deserialize(b, false);
154         }
155       };
156   private static final int deserializerIdentifier;
157   static {
158     deserializerIdentifier = CacheableDeserializerIdManager
159         .registerDeserializer(blockDeserializer);
160   }
161 
162   private BlockType blockType;
163 
164   /** Size on disk without the header. It includes checksum data too. */
165   private int onDiskSizeWithoutHeader;
166 
167   /** Size of pure data. Does not include header or checksums */
168   private final int uncompressedSizeWithoutHeader;
169 
170   /** The offset of the previous block on disk */
171   private final long prevBlockOffset;
172 
173   /** The Type of checksum, better to store the byte than an object */
174   private final byte checksumType;
175 
176   /** The number of bytes for which a checksum is computed */
177   private final int bytesPerChecksum;
178 
179   /** Size on disk of header and data. Does not include checksum data */
180   private final int onDiskDataSizeWithHeader;
181 
182   /** The minor version of the hfile. */
183   private final int minorVersion;
184 
185   /** The in-memory representation of the hfile block */
186   private ByteBuffer buf;
187 
188   /** Whether there is a memstore timestamp after every key/value */
189   private boolean includesMemstoreTS;
190 
191   /**
192    * The offset of this block in the file. Populated by the reader for
193    * convenience of access. This offset is not part of the block header.
194    */
195   private long offset = -1;
196 
197   /**
198    * The on-disk size of the next block, including the header, obtained by
199    * peeking into the first {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes of the next block's
200    * header, or -1 if unknown.
201    */
202   private int nextBlockOnDiskSizeWithHeader = -1;
203 
204   /**
205    * Creates a new {@link HFile} block from the given fields. This constructor
206    * is mostly used when the block data has already been read and uncompressed,
207    * and is sitting in a byte buffer. 
208    *
209    * @param blockType the type of this block, see {@link BlockType}
210    * @param onDiskSizeWithoutHeader compressed size of the block if compression
211    *          is used, otherwise uncompressed size, header size not included
212    * @param uncompressedSizeWithoutHeader uncompressed size of the block,
213    *          header size not included. Equals onDiskSizeWithoutHeader if
214    *          compression is disabled.
215    * @param prevBlockOffset the offset of the previous block in the
216    *          {@link HFile}
217    * @param buf block header ({@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes) followed by
218    *          uncompressed data.
219    * @param fillHeader true to fill in the first {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes of
220    *          the buffer based on the header fields provided
221    * @param offset the file offset the block was read from
222    * @param minorVersion the minor version of this block
223    * @param bytesPerChecksum the number of bytes per checksum chunk
224    * @param checksumType the checksum algorithm to use
225    * @param onDiskDataSizeWithHeader size of header and data on disk not
226    *        including checksum data
227    */
228   HFileBlock(BlockType blockType, int onDiskSizeWithoutHeader,
229       int uncompressedSizeWithoutHeader, long prevBlockOffset, ByteBuffer buf,
230       boolean fillHeader, long offset, boolean includesMemstoreTS, 
231       int minorVersion, int bytesPerChecksum, byte checksumType,
232       int onDiskDataSizeWithHeader) {
233     this.blockType = blockType;
234     this.onDiskSizeWithoutHeader = onDiskSizeWithoutHeader;
235     this.uncompressedSizeWithoutHeader = uncompressedSizeWithoutHeader;
236     this.prevBlockOffset = prevBlockOffset;
237     this.buf = buf;
238     if (fillHeader)
239       overwriteHeader();
240     this.offset = offset;
241     this.includesMemstoreTS = includesMemstoreTS;
242     this.minorVersion = minorVersion;
243     this.bytesPerChecksum = bytesPerChecksum;
244     this.checksumType = checksumType;
245     this.onDiskDataSizeWithHeader = onDiskDataSizeWithHeader;
246   }
247 
248   /**
249    * Creates a block from an existing buffer starting with a header. Rewinds
250    * and takes ownership of the buffer. By definition of rewind, ignores the
251    * buffer position, but if you slice the buffer beforehand, it will rewind
252    * to that point. The reason this takes a minor version and not a major
253    * version is that major versions indicate the format of an HFile whereas
254    * minor versions indicate the format inside an HFileBlock.
255    */
256   HFileBlock(ByteBuffer b, int minorVersion) throws IOException {
257     b.rewind();
258     blockType = BlockType.read(b);
259     onDiskSizeWithoutHeader = b.getInt();
260     uncompressedSizeWithoutHeader = b.getInt();
261     prevBlockOffset = b.getLong();
262     this.minorVersion = minorVersion;
263     if (minorVersion >= MINOR_VERSION_WITH_CHECKSUM) {
264       this.checksumType = b.get();
265       this.bytesPerChecksum = b.getInt();
266       this.onDiskDataSizeWithHeader = b.getInt();
267     } else {
268       this.checksumType = ChecksumType.NULL.getCode();
269       this.bytesPerChecksum = 0;
270       this.onDiskDataSizeWithHeader = onDiskSizeWithoutHeader +
271                                        HConstants.HFILEBLOCK_HEADER_SIZE_NO_CHECKSUM;
272     }
273     buf = b;
274     buf.rewind();
275   }
276 
277   public BlockType getBlockType() {
278     return blockType;
279   }
280 
281   /** @return get data block encoding id that was used to encode this block */
282   public short getDataBlockEncodingId() {
283     if (blockType != BlockType.ENCODED_DATA) {
284       throw new IllegalArgumentException("Querying encoder ID of a block " +
285           "of type other than " + BlockType.ENCODED_DATA + ": " + blockType);
286     }
287     return buf.getShort(headerSize());
288   }
289 
290   /**
291    * @return the on-disk size of the block with header size included. This
292    * includes the header, the data and the checksum data.
293    */
294   public int getOnDiskSizeWithHeader() {
295     return onDiskSizeWithoutHeader + headerSize();
296   }
297 
298   /**
299    * Returns the size of the compressed part of the block in case compression
300    * is used, or the uncompressed size of the data part otherwise. Checksum
301    * data is included; the header is not.
302    *
303    * @return the on-disk size of the block, header not included but
304    *         checksum data included.
305    */
306   public int getOnDiskSizeWithoutHeader() {
307     return onDiskSizeWithoutHeader;
308   }
309 
310   /**
311    * @return the uncompressed size of the data part of the block, header not
312    *         included
313    */
314    public int getUncompressedSizeWithoutHeader() {
315     return uncompressedSizeWithoutHeader;
316   }
317 
318   /**
319    * @return the offset of the previous block of the same type in the file, or
320    *         -1 if unknown
321    */
322   public long getPrevBlockOffset() {
323     return prevBlockOffset;
324   }
325 
326   /**
327    * Writes header fields into the first {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes of the
328    * buffer. Resets the buffer position to the end of header as side effect.
329    */
330   private void overwriteHeader() {
331     buf.rewind();
332     blockType.write(buf);
333     buf.putInt(onDiskSizeWithoutHeader);
334     buf.putInt(uncompressedSizeWithoutHeader);
335     buf.putLong(prevBlockOffset);
336   }
337 
338   /**
339    * Returns a buffer that does not include the header. The array offset points
340    * to the start of the block data right after the header. The underlying data
341    * array is not copied. Checksum data is not included in the returned buffer.
342    *
343    * @return the buffer with header skipped
344    */
345   public ByteBuffer getBufferWithoutHeader() {
346     return ByteBuffer.wrap(buf.array(), buf.arrayOffset() + headerSize(),
347         buf.limit() - headerSize() - totalChecksumBytes()).slice();
348   }
349 
350   /**
351    * Returns the buffer this block stores internally. The clients must not
352    * modify the buffer object. This method has to be public because it is
353    * used in {@link CompoundBloomFilter} to avoid object creation on every
354    * Bloom filter lookup, but has to be used with caution. Checksum data
355    * is not included in the returned buffer.
356    *
357    * @return the buffer of this block for read-only operations
358    */
359   public ByteBuffer getBufferReadOnly() {
360     return ByteBuffer.wrap(buf.array(), buf.arrayOffset(),
361         buf.limit() - totalChecksumBytes()).slice();
362   }
363 
364   /**
365    * Returns the buffer of this block, including header data. The clients must
366    * not modify the buffer object. This method has to be public because it is
367    * used in {@link BucketCache} to avoid buffer copy.
368    * 
369    * @return the byte buffer with header included for read-only operations
370    */
371   public ByteBuffer getBufferReadOnlyWithHeader() {
372     return ByteBuffer.wrap(buf.array(), buf.arrayOffset(), buf.limit()).slice();
373   }
374 
375   /**
376    * Returns a byte buffer of this block, including header data, positioned at
377    * the beginning of header. The underlying data array is not copied.
378    *
379    * @return the byte buffer with header included
380    */
381   ByteBuffer getBufferWithHeader() {
382     ByteBuffer dupBuf = buf.duplicate();
383     dupBuf.rewind();
384     return dupBuf;
385   }
386 
387   private void sanityCheckAssertion(long valueFromBuf, long valueFromField,
388       String fieldName) throws IOException {
389     if (valueFromBuf != valueFromField) {
390       throw new AssertionError(fieldName + " in the buffer (" + valueFromBuf
391           + ") is different from that in the field (" + valueFromField + ")");
392     }
393   }
394 
395   /**
396    * Checks if the block is internally consistent, i.e. the first
397    * {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes of the buffer contain a valid header consistent
398    * with the fields. This function is primarily for testing and debugging, and
399    * is not thread-safe, because it alters the internal buffer pointer.
400    */
401   void sanityCheck() throws IOException {
402     buf.rewind();
403 
404     {
405       BlockType blockTypeFromBuf = BlockType.read(buf);
406       if (blockTypeFromBuf != blockType) {
407         throw new IOException("Block type stored in the buffer: " +
408             blockTypeFromBuf + ", block type field: " + blockType);
409       }
410     }
411 
412     sanityCheckAssertion(buf.getInt(), onDiskSizeWithoutHeader,
413         "onDiskSizeWithoutHeader");
414 
415     sanityCheckAssertion(buf.getInt(), uncompressedSizeWithoutHeader,
416         "uncompressedSizeWithoutHeader");
417 
418     sanityCheckAssertion(buf.getLong(), prevBlockOffset, "prevBlockOffset");
419     if (minorVersion >= MINOR_VERSION_WITH_CHECKSUM) {
420       sanityCheckAssertion(buf.get(), checksumType, "checksumType");
421       sanityCheckAssertion(buf.getInt(), bytesPerChecksum, "bytesPerChecksum");
422       sanityCheckAssertion(buf.getInt(), onDiskDataSizeWithHeader, 
423                            "onDiskDataSizeWithHeader");
424     }
425 
426     int cksumBytes = totalChecksumBytes();
427     int hdrSize = headerSize();
428     int expectedBufLimit = uncompressedSizeWithoutHeader + hdrSize +
429                            cksumBytes;
430     if (buf.limit() != expectedBufLimit) {
431       throw new AssertionError("Expected buffer limit " + expectedBufLimit
432           + ", got " + buf.limit());
433     }
434 
435     // We might optionally allocate HFILEBLOCK_HEADER_SIZE more bytes to read the next
436     // block's header, so there are two sensible values for buffer capacity.
437     int size = uncompressedSizeWithoutHeader + hdrSize + cksumBytes;
438     if (buf.capacity() != size &&
439         buf.capacity() != size + hdrSize) {
440       throw new AssertionError("Invalid buffer capacity: " + buf.capacity() +
441           ", expected " + size + " or " + (size + hdrSize));
442     }
443   }
444 
445   @Override
446   public String toString() {
447     return "blockType="
448         + blockType
449         + ", onDiskSizeWithoutHeader="
450         + onDiskSizeWithoutHeader
451         + ", uncompressedSizeWithoutHeader="
452         + uncompressedSizeWithoutHeader
453         + ", prevBlockOffset="
454         + prevBlockOffset
455         + ", dataBeginsWith="
456         + Bytes.toStringBinary(buf.array(), buf.arrayOffset() + headerSize(),
457             Math.min(32, buf.limit() - buf.arrayOffset() - headerSize()))
458         + ", fileOffset=" + offset;
459   }
460 
461   private void validateOnDiskSizeWithoutHeader(
462       int expectedOnDiskSizeWithoutHeader) throws IOException {
463     if (onDiskSizeWithoutHeader != expectedOnDiskSizeWithoutHeader) {
464       String blockInfoMsg =
465         "Block offset: " + offset + ", data starts with: "
466           + Bytes.toStringBinary(buf.array(), buf.arrayOffset(),
467               buf.arrayOffset() + Math.min(32, buf.limit()));
468       throw new IOException("On-disk size without header provided is "
469           + expectedOnDiskSizeWithoutHeader + ", but block "
470           + "header contains " + onDiskSizeWithoutHeader + ". " +
471           blockInfoMsg);
472     }
473   }
474 
475   /**
476    * Always allocates a new buffer of the correct size. Copies header bytes
477    * from the existing buffer. Does not change header fields. 
478    * Reserve room to keep checksum bytes too.
479    *
480    * @param extraBytes whether to reserve room in the buffer to read the next
481    *          block's header
482    */
483   private void allocateBuffer(boolean extraBytes) {
484     int cksumBytes = totalChecksumBytes();
485     int capacityNeeded = headerSize() + uncompressedSizeWithoutHeader +
486         cksumBytes +
487         (extraBytes ? headerSize() : 0);
488 
489     ByteBuffer newBuf = ByteBuffer.allocate(capacityNeeded);
490 
491     // Copy header bytes.
492     System.arraycopy(buf.array(), buf.arrayOffset(), newBuf.array(),
493         newBuf.arrayOffset(), headerSize());
494 
495     buf = newBuf;
496     buf.limit(headerSize() + uncompressedSizeWithoutHeader + cksumBytes);
497   }
498 
499   /** An additional sanity-check in case no compression is being used. */
500   public void assumeUncompressed() throws IOException {
501     if (onDiskSizeWithoutHeader != uncompressedSizeWithoutHeader + 
502         totalChecksumBytes()) {
503       throw new IOException("Using no compression but "
504           + "onDiskSizeWithoutHeader=" + onDiskSizeWithoutHeader + ", "
505           + "uncompressedSizeWithoutHeader=" + uncompressedSizeWithoutHeader
506           + ", numChecksumBytes=" + totalChecksumBytes());
507     }
508   }
509 
510   /**
511    * @param expectedType the expected type of this block
512    * @throws IOException if this block's type is different than expected
513    */
514   public void expectType(BlockType expectedType) throws IOException {
515     if (blockType != expectedType) {
516       throw new IOException("Invalid block type: expected=" + expectedType
517           + ", actual=" + blockType);
518     }
519   }
520 
521   /** @return the offset of this block in the file it was read from */
522   public long getOffset() {
523     if (offset < 0) {
524       throw new IllegalStateException(
525           "HFile block offset not initialized properly");
526     }
527     return offset;
528   }
529 
530   /**
531    * @return a byte stream reading the data section of this block
532    */
533   public DataInputStream getByteStream() {
534     return new DataInputStream(new ByteArrayInputStream(buf.array(),
535         buf.arrayOffset() + headerSize(), buf.limit() - headerSize()));
536   }
537 
538   @Override
539   public long heapSize() {
540     long size = ClassSize.align(
541         ClassSize.OBJECT +
542         // Block type and byte buffer references
543         2 * ClassSize.REFERENCE +
544         // On-disk size, uncompressed size, and next block's on-disk size
545         // bytesPerChecksum, onDiskDataSize and minorVersion
546         6 * Bytes.SIZEOF_INT +
547         // Checksum type
548         1 * Bytes.SIZEOF_BYTE +
549         // This and previous block offset
550         2 * Bytes.SIZEOF_LONG +
551         // "Include memstore timestamp" flag
552         Bytes.SIZEOF_BOOLEAN
553     );
554 
555     if (buf != null) {
556       // Deep overhead of the byte buffer. Needs to be aligned separately.
557       size += ClassSize.align(buf.capacity() + BYTE_BUFFER_HEAP_SIZE);
558     }
559 
560     return ClassSize.align(size);
561   }
562 
563   /**
564    * Read from an input stream. Analogous to
565    * {@link IOUtils#readFully(InputStream, byte[], int, int)}, but specifies a
566    * number of "extra" bytes that would be desirable but not absolutely
567    * necessary to read.
568    *
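   * <p>An illustrative (assumed) call, matching how block readers use it to
   * opportunistically pick up the next block's header in the same read:
   * <pre>{@code
   * boolean gotNextHeader =
   *     readWithExtra(in, dest, 0, onDiskSizeWithHeader, hdrSize);
   * }</pre>
   *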
569    * @param in the input stream to read from
570    * @param buf the buffer to read into
571    * @param bufOffset the destination offset in the buffer
572    * @param necessaryLen the number of bytes that are absolutely necessary to
573    *          read
574    * @param extraLen the number of extra bytes that would be nice to read
575    * @return true if succeeded reading the extra bytes
576    * @throws IOException if failed to read the necessary bytes
577    */
578   public static boolean readWithExtra(InputStream in, byte buf[],
579       int bufOffset, int necessaryLen, int extraLen) throws IOException {
580     int bytesRemaining = necessaryLen + extraLen;
581     while (bytesRemaining > 0) {
582       int ret = in.read(buf, bufOffset, bytesRemaining);
583       if (ret == -1 && bytesRemaining <= extraLen) {
584         // We could not read the "extra data", but that is OK.
585         break;
586       }
587 
588       if (ret < 0) {
589         throw new IOException("Premature EOF from inputStream (read "
590             + "returned " + ret + ", was trying to read " + necessaryLen
591             + " necessary bytes and " + extraLen + " extra bytes, "
592             + "successfully read "
593             + (necessaryLen + extraLen - bytesRemaining));
594       }
595       bufOffset += ret;
596       bytesRemaining -= ret;
597     }
598     return bytesRemaining <= 0;
599   }
600 
601   /**
602    * @return the on-disk size of the next block (including the header size)
603    *         that was read by peeking into the next block's header
604    */
605   public int getNextBlockOnDiskSizeWithHeader() {
606     return nextBlockOnDiskSizeWithHeader;
607   }
608 
609 
610   /**
611    * Unified version 2 {@link HFile} block writer. The intended usage pattern
612    * is as follows:
613    * <ol>
614    * <li>Construct an {@link HFileBlock.Writer}, providing a compression algorithm.
615    * <li>Call {@link Writer#startWriting} and get a data stream to write to.
616    * <li>Write your data into the stream.
617    * <li>Call {@link Writer#writeHeaderAndData(FSDataOutputStream)} as many times as you
618    * need to store the serialized block into an external stream.
619    * <li>Repeat to write more blocks.
620    * </ol>
621    * <p>
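   * A minimal usage sketch; {@code out} (an {@code FSDataOutputStream}) and
   * {@code myData} (a byte array) are assumed to exist, and the compression,
   * encoder and checksum settings below are illustrative only:
   * <pre>{@code
   * HFileBlock.Writer w = new HFileBlock.Writer(Compression.Algorithm.NONE,
   *     NoOpDataBlockEncoder.INSTANCE, true, ChecksumType.CRC32, 16 * 1024);
   * DataOutputStream dos = w.startWriting(BlockType.DATA);
   * dos.write(myData);                  // the caller's data for this block
   * w.writeHeaderAndData(out);          // finishes the block and appends it to out
   * // repeat startWriting(...) and writeHeaderAndData(out) for further blocks
   * }</pre>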
622    */
623   public static class Writer {
624 
625     private enum State {
626       INIT,
627       WRITING,
628       BLOCK_READY
629     };
630 
631     /** Writer state. Used to ensure the correct usage protocol. */
632     private State state = State.INIT;
633 
634     /** Data block encoder used for data blocks */
635     private final HFileDataBlockEncoder dataBlockEncoder;
636 
637     private HFileBlockEncodingContext dataBlockEncodingCtx;
638 
639     /** block encoding context for non-data blocks */
640     private HFileBlockDefaultEncodingContext defaultBlockEncodingCtx;
641 
642     /**
643      * The stream we use to accumulate data in uncompressed format for each
644      * block. We reset this stream at the end of each block and reuse it. The
645      * header is written as the first {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes into this
646      * stream.
647      */
648     private ByteArrayOutputStream baosInMemory;
649 
650     /**
651      * Current block type. Set in {@link #startWriting(BlockType)}. Could be
652      * changed in {@link #encodeDataBlockForDisk()} from {@link BlockType#DATA}
653      * to {@link BlockType#ENCODED_DATA}.
654      */
655     private BlockType blockType;
656 
657     /**
658      * A stream that we write uncompressed bytes to, which compresses them and
659      * writes them to {@link #baosInMemory}.
660      */
661     private DataOutputStream userDataStream;
662 
663     /**
664      * Bytes to be written to the file system, including the header. Compressed
665      * if compression is turned on. It also includes the checksum data that 
666      * immediately follows the block data. (header + data + checksums)
667      */
668     private byte[] onDiskBytesWithHeader;
669 
670     /**
671      * The size of the checksum data on disk. It is used only if data is
672      * not compressed. If data is compressed, then the checksums are already
673      * part of onDiskBytesWithHeader. If data is uncompressed, then this
674      * variable stores the checksum data for this block.
675      */
676     private byte[] onDiskChecksum;
677 
678     /**
679      * Valid in the BLOCK_READY state. Contains the header and the uncompressed (but
680      * potentially encoded, if this is a data block) bytes, so the length is
681      * {@link #uncompressedSizeWithoutHeader} + {@link org.apache.hadoop.hbase.HConstants#HFILEBLOCK_HEADER_SIZE}.
682      * Does not store checksums.
683      */
684     private byte[] uncompressedBytesWithHeader;
685 
686     /**
687      * Current block's start offset in the {@link HFile}. Set in
688      * {@link #writeHeaderAndData(FSDataOutputStream)}.
689      */
690     private long startOffset;
691 
692     /**
693      * Offset of previous block by block type. Updated when the next block is
694      * started.
695      */
696     private long[] prevOffsetByType;
697 
698     /** The offset of the previous block of the same type */
699     private long prevOffset;
700 
701     /** Whether we are including memstore timestamp after every key/value */
702     private boolean includesMemstoreTS;
703 
704     /** Checksum settings */
705     private ChecksumType checksumType;
706     private int bytesPerChecksum;
707 
708     /**
709      * @param compressionAlgorithm compression algorithm to use
710      * @param dataBlockEncoder data block encoding algorithm to use
711      * @param checksumType type of checksum
712      * @param bytesPerChecksum bytes per checksum
713      */
714     public Writer(Compression.Algorithm compressionAlgorithm,
715           HFileDataBlockEncoder dataBlockEncoder, boolean includesMemstoreTS,
716           ChecksumType checksumType, int bytesPerChecksum) {
717       this.dataBlockEncoder = dataBlockEncoder != null
718           ? dataBlockEncoder : NoOpDataBlockEncoder.INSTANCE;
719       defaultBlockEncodingCtx =
720         new HFileBlockDefaultEncodingContext(compressionAlgorithm, null, HConstants.HFILEBLOCK_DUMMY_HEADER);
721       dataBlockEncodingCtx =
722         this.dataBlockEncoder.newDataBlockEncodingContext(
723             compressionAlgorithm, HConstants.HFILEBLOCK_DUMMY_HEADER);
724 
725       if (bytesPerChecksum < HConstants.HFILEBLOCK_HEADER_SIZE) {
726         throw new RuntimeException("Unsupported value of bytesPerChecksum. " +
727             " Minimum is " + HConstants.HFILEBLOCK_HEADER_SIZE + " but the configured value is " +
728             bytesPerChecksum);
729       }
730 
731       baosInMemory = new ByteArrayOutputStream();
732       
733       prevOffsetByType = new long[BlockType.values().length];
734       for (int i = 0; i < prevOffsetByType.length; ++i)
735         prevOffsetByType[i] = -1;
736 
737       this.includesMemstoreTS = includesMemstoreTS;
738       this.checksumType = checksumType;
739       this.bytesPerChecksum = bytesPerChecksum;
740     }
741 
742     /**
743      * Starts writing into the block. The previous block's data is discarded.
744      *
745      * @return the stream the user can write their data into
746      * @throws IOException
747      */
748     public DataOutputStream startWriting(BlockType newBlockType)
749         throws IOException {
750       if (state == State.BLOCK_READY && startOffset != -1) {
751         // We had a previous block that was written to a stream at a specific
752         // offset. Save that offset as the last offset of a block of that type.
753         prevOffsetByType[blockType.getId()] = startOffset;
754       }
755 
756       startOffset = -1;
757       blockType = newBlockType;
758 
759       baosInMemory.reset();
760       baosInMemory.write(HConstants.HFILEBLOCK_DUMMY_HEADER);
761 
762       state = State.WRITING;
763 
764       // We will compress it later in finishBlock()
765       userDataStream = new DataOutputStream(baosInMemory);
766       return userDataStream;
767     }
768 
769     /**
770      * Returns the stream for the user to write to. The block writer takes care
771      * of handling compression and buffering for caching on write. Can only be
772      * called in the "writing" state.
773      *
774      * @return the data output stream for the user to write to
775      */
776     DataOutputStream getUserDataStream() {
777       expectState(State.WRITING);
778       return userDataStream;
779     }
780 
781     /**
782      * Transitions the block writer from the "writing" state to the "block
783      * ready" state.  Does nothing if a block is already finished.
784      */
785     private void ensureBlockReady() throws IOException {
786       Preconditions.checkState(state != State.INIT,
787           "Unexpected state: " + state);
788 
789       if (state == State.BLOCK_READY)
790         return;
791 
792       // This will set state to BLOCK_READY.
793       finishBlock();
794     }
795 
796     /**
797      * An internal method that flushes the compressing stream (if using
798      * compression), serializes the header, and takes care of the separate
799      * uncompressed stream for caching on write, if applicable. Sets block
800      * write state to "block ready".
801      */
802     private void finishBlock() throws IOException {
803       userDataStream.flush();
804       // This does an array copy, so it is safe to cache this byte array.
805       uncompressedBytesWithHeader = baosInMemory.toByteArray();
806       prevOffset = prevOffsetByType[blockType.getId()];
807 
808       // We need to set state before we can package the block up for
809       // cache-on-write. In a way, the block is ready, but not yet encoded or
810       // compressed.
811       state = State.BLOCK_READY;
812       if (blockType == BlockType.DATA) {
813         encodeDataBlockForDisk();
814       } else {
815         defaultBlockEncodingCtx.compressAfterEncodingWithBlockType(
816             uncompressedBytesWithHeader, blockType);
817         onDiskBytesWithHeader =
818           defaultBlockEncodingCtx.getOnDiskBytesWithHeader();
819       }
820 
821       int numBytes = (int) ChecksumUtil.numBytes(
822           onDiskBytesWithHeader.length,
823           bytesPerChecksum);
824 
825       // put the header for on disk bytes
826       putHeader(onDiskBytesWithHeader, 0,
827           onDiskBytesWithHeader.length + numBytes,
828           uncompressedBytesWithHeader.length, onDiskBytesWithHeader.length);
829       // set the header for the uncompressed bytes (for cache-on-write)
830       putHeader(uncompressedBytesWithHeader, 0,
831           onDiskBytesWithHeader.length + numBytes,
832           uncompressedBytesWithHeader.length, onDiskBytesWithHeader.length);
833 
834       onDiskChecksum = new byte[numBytes];
835       ChecksumUtil.generateChecksums(
836           onDiskBytesWithHeader, 0, onDiskBytesWithHeader.length,
837           onDiskChecksum, 0, checksumType, bytesPerChecksum);
838     }
839 
840     /**
841      * Encodes this block if it is a data block and encoding is turned on in
842      * {@link #dataBlockEncoder}.
843      */
844     private void encodeDataBlockForDisk() throws IOException {
845       // do data block encoding, if data block encoder is set
846       ByteBuffer rawKeyValues =
847           ByteBuffer.wrap(uncompressedBytesWithHeader, HConstants.HFILEBLOCK_HEADER_SIZE,
848               uncompressedBytesWithHeader.length - HConstants.HFILEBLOCK_HEADER_SIZE).slice();
849 
850       //do the encoding
851       dataBlockEncoder.beforeWriteToDisk(rawKeyValues,
852               includesMemstoreTS, dataBlockEncodingCtx, blockType);
853 
854       uncompressedBytesWithHeader =
855           dataBlockEncodingCtx.getUncompressedBytesWithHeader();
856       onDiskBytesWithHeader =
857           dataBlockEncodingCtx.getOnDiskBytesWithHeader();
858       blockType = dataBlockEncodingCtx.getBlockType();
859     }
860 
861     /**
862      * Put the header into the given byte array at the given offset.
863      * @param onDiskSize size of the block on disk (header + data + checksums)
864      * @param uncompressedSize size of the block after decompression (but
865      *          before optional data block decoding) including header
866      * @param onDiskDataSize size of the block on disk with header
867      *        and data but not including the checksums
868      */
869     private void putHeader(byte[] dest, int offset, int onDiskSize,
870         int uncompressedSize, int onDiskDataSize) {
871       offset = blockType.put(dest, offset);
872       offset = Bytes.putInt(dest, offset, onDiskSize - HConstants.HFILEBLOCK_HEADER_SIZE);
873       offset = Bytes.putInt(dest, offset, uncompressedSize - HConstants.HFILEBLOCK_HEADER_SIZE);
874       offset = Bytes.putLong(dest, offset, prevOffset);
875       offset = Bytes.putByte(dest, offset, checksumType.getCode());
876       offset = Bytes.putInt(dest, offset, bytesPerChecksum);
877       Bytes.putInt(dest, offset, onDiskDataSize);
878     }
879 
880     /**
881      * Similar to {@link #writeHeaderAndData(FSDataOutputStream)}, but records
882      * the offset of this block so that it can be referenced in the next block
883      * of the same type.
884      *
885      * @param out the output stream to write this block to
886      * @throws IOException
887      */
888     public void writeHeaderAndData(FSDataOutputStream out) throws IOException {
889       long offset = out.getPos();
890       if (startOffset != -1 && offset != startOffset) {
891         throw new IOException("A " + blockType + " block written to a "
892             + "stream twice, first at offset " + startOffset + ", then at "
893             + offset);
894       }
895       startOffset = offset;
896 
897       finishBlockAndWriteHeaderAndData((DataOutputStream) out);
898     }
899 
900     /**
901      * Writes the header and the compressed data of this block (or uncompressed
902      * data when not using compression) into the given stream. Can be called in
903      * the "writing" state or in the "block ready" state. If called in the
904      * "writing" state, transitions the writer to the "block ready" state.
905      *
906      * @param out the output stream to write the block to
907      * @throws IOException
908      */
909     private void finishBlockAndWriteHeaderAndData(DataOutputStream out)
910       throws IOException {
911       ensureBlockReady();
912       out.write(onDiskBytesWithHeader);
913       out.write(onDiskChecksum);
914     }
915 
916     /**
917      * Returns the header or the compressed data (or uncompressed data when not
918      * using compression) as a byte array. Can be called in the "writing" state
919      * or in the "block ready" state. If called in the "writing" state,
920      * transitions the writer to the "block ready" state. This returns
921      * the header + data + checksums stored on disk.
922      *
923      * @return header and data as they would be stored on disk in a byte array
924      * @throws IOException
925      */
926     byte[] getHeaderAndDataForTest() throws IOException {
927       ensureBlockReady();
928       // This is not very optimal, because we are doing an extra copy.
929       // But this method is used only by unit tests.
930       byte[] output =
931           new byte[onDiskBytesWithHeader.length
932               + onDiskChecksum.length];
933       System.arraycopy(onDiskBytesWithHeader, 0, output, 0,
934           onDiskBytesWithHeader.length);
935       System.arraycopy(onDiskChecksum, 0, output,
936           onDiskBytesWithHeader.length, onDiskChecksum.length);
937       return output;
938     }
939 
940     /**
941      * Releases resources used by this writer.
942      */
943     public void release() {
944       if (dataBlockEncodingCtx != null) {
945         dataBlockEncodingCtx.close();
946         dataBlockEncodingCtx = null;
947       }
948       if (defaultBlockEncodingCtx != null) {
949         defaultBlockEncodingCtx.close();
950         defaultBlockEncodingCtx = null;
951       }
952     }
953 
954     /**
955      * Returns the on-disk size of the data portion of the block. This is the
956      * compressed size if compression is enabled. Can only be called in the
957      * "block ready" state. Header is not compressed, and its size is not
958      * included in the return value.
959      *
960      * @return the on-disk size of the block, not including the header.
961      */
962     int getOnDiskSizeWithoutHeader() {
963       expectState(State.BLOCK_READY);
964       return onDiskBytesWithHeader.length + onDiskChecksum.length - HConstants.HFILEBLOCK_HEADER_SIZE;
965     }
966 
967     /**
968      * Returns the on-disk size of the block. Can only be called in the
969      * "block ready" state.
970      *
971      * @return the on-disk size of the block ready to be written, including the
972      *         header size, the data and the checksum data.
973      */
974     int getOnDiskSizeWithHeader() {
975       expectState(State.BLOCK_READY);
976       return onDiskBytesWithHeader.length + onDiskChecksum.length;
977     }
978 
979     /**
980      * The uncompressed size of the block data. Does not include header size.
981      */
982     int getUncompressedSizeWithoutHeader() {
983       expectState(State.BLOCK_READY);
984       return uncompressedBytesWithHeader.length - HConstants.HFILEBLOCK_HEADER_SIZE;
985     }
986 
987     /**
988      * The uncompressed size of the block data, including header size.
989      */
990     int getUncompressedSizeWithHeader() {
991       expectState(State.BLOCK_READY);
992       return uncompressedBytesWithHeader.length;
993     }
994 
995     /** @return true if a block is being written  */
996     public boolean isWriting() {
997       return state == State.WRITING;
998     }
999 
1000     /**
1001      * Returns the number of bytes written into the current block so far, or
1002      * zero if not writing the block at the moment. Note that this will return
1003      * zero in the "block ready" state as well.
1004      *
1005      * @return the number of bytes written
1006      */
1007     public int blockSizeWritten() {
1008       if (state != State.WRITING)
1009         return 0;
1010       return userDataStream.size();
1011     }
1012 
1013     /**
1014      * Returns the header followed by the uncompressed data, even if using
1015      * compression. This is needed for storing uncompressed blocks in the block
1016      * cache. Can be called in the "writing" state or the "block ready" state.
1017      * Returns only the header and data, does not include checksum data.
1018      *
1019      * @return uncompressed block bytes for caching on write
1020      */
1021     ByteBuffer getUncompressedBufferWithHeader() {
1022       expectState(State.BLOCK_READY);
1023       return ByteBuffer.wrap(uncompressedBytesWithHeader);
1024     }
1025 
1026     private void expectState(State expectedState) {
1027       if (state != expectedState) {
1028         throw new IllegalStateException("Expected state: " + expectedState +
1029             ", actual state: " + state);
1030       }
1031     }
1032 
1033     /**
1034      * Takes the given {@link BlockWritable} instance, creates a new block of
1035      * its appropriate type, writes the writable into this block, and flushes
1036      * the block into the output stream. The writer is instructed not to buffer
1037      * uncompressed bytes for cache-on-write.
1038      *
1039      * @param bw the block-writable object to write as a block
1040      * @param out the file system output stream
1041      * @throws IOException
1042      */
1043     public void writeBlock(BlockWritable bw, FSDataOutputStream out)
1044         throws IOException {
1045       bw.writeToBlock(startWriting(bw.getBlockType()));
1046       writeHeaderAndData(out);
1047     }
1048 
1049     /**
1050      * Creates a new HFileBlock. Checksums have already been validated, so
1051      * the byte buffer passed into the constructor of this newly created
1052      * block does not have checksum data even though the header minor 
1053      * version is MINOR_VERSION_WITH_CHECKSUM. This is indicated by setting a
1054      * 0 value in bytesPerChecksum.
1055      */
1056     public HFileBlock getBlockForCaching() {
1057       return new HFileBlock(blockType, getOnDiskSizeWithoutHeader(),
1058           getUncompressedSizeWithoutHeader(), prevOffset,
1059           getUncompressedBufferWithHeader(), DONT_FILL_HEADER, startOffset,
1060           includesMemstoreTS, MINOR_VERSION_WITH_CHECKSUM,
1061           0, ChecksumType.NULL.getCode(),  // no checksums in cached data
1062           onDiskBytesWithHeader.length + onDiskChecksum.length);
1063     }
1064   }
1065 
1066   /** Something that can be written into a block. */
1067   public interface BlockWritable {
1068 
1069     /** The type of block this data should use. */
1070     BlockType getBlockType();
1071 
1072     /**
1073      * Writes the block to the provided stream. Must not write any magic
1074      * records.
1075      *
1076      * @param out a stream to write uncompressed data into
1077      */
1078     void writeToBlock(DataOutput out) throws IOException;
1079   }
1080 
1081   // Block readers and writers
1082 
1083   /** An interface allowing to iterate {@link HFileBlock}s. */
1084   public interface BlockIterator {
1085 
1086     /**
1087      * Get the next block, or null if there are no more blocks to iterate.
1088      */
1089     HFileBlock nextBlock() throws IOException;
1090 
1091     /**
1092      * Similar to {@link #nextBlock()} but checks block type, throws an
1093      * exception if incorrect, and returns the HFile block
1094      */
1095     HFileBlock nextBlockWithBlockType(BlockType blockType) throws IOException;
1096   }
1097 
1098   /** A full-fledged reader with iteration ability. */
1099   public interface FSReader {
1100 
1101     /**
1102      * Reads the block at the given offset in the file with the given on-disk
1103      * size and uncompressed size.
1104      *
1105      * @param offset the offset in the stream to read at
1106      * @param onDiskSize the on-disk size of the entire block, including all
1107      *          applicable headers, or -1 if unknown
1108      * @param uncompressedSize the uncompressed size of the compressed part of
1109      *          the block, or -1 if unknown
1110      * @return the newly read block
1111      */
1112     HFileBlock readBlockData(long offset, long onDiskSize,
1113         int uncompressedSize, boolean pread) throws IOException;
1114 
1115     /**
1116      * Creates a block iterator over the given portion of the {@link HFile}.
1117      * The iterator returns blocks starting with offset such that offset <=
1118      * The iterator returns blocks whose start offsets satisfy
1119      * startOffset <= offset < endOffset.
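     * <p>A minimal usage sketch; {@code reader} is an assumed {@code FSReader}
     * and the offsets are assumed to fall on block boundaries:
     * <pre>{@code
     * BlockIterator it = reader.blockRange(startOffset, endOffset);
     * HFileBlock b;
     * while ((b = it.nextBlock()) != null) {
     *   // process block b
     * }
     * }</pre>
     *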
1120      * @param startOffset the offset of the block to start iteration with
1121      * @param endOffset the offset to end iteration at (exclusive)
1122      * @return an iterator of blocks between the two given offsets
1123      */
1124     BlockIterator blockRange(long startOffset, long endOffset);
1125 
1126     /** Closes the backing streams */
1127     void closeStreams() throws IOException;
1128   }
1129 
1130   /**
1131    * A common implementation of some methods of {@link FSReader} and some
1132    * tools for implementing HFile format version-specific block readers.
1133    */
1134   private abstract static class AbstractFSReader implements FSReader {
1135     /** Compression algorithm used by the {@link HFile} */
1136     protected Compression.Algorithm compressAlgo;
1137 
1138     /** The size of the file we are reading from, or -1 if unknown. */
1139     protected long fileSize;
1140 
1141     /** The minor version of this reader */
1142     private int minorVersion;
1143 
1144     /** The size of the header */
1145     protected final int hdrSize;
1146 
1147     /** The filesystem used to access data */
1148     protected HFileSystem hfs;
1149 
1150     /** The path (if any) where this data is coming from */
1151     protected Path path;
1152 
1153     private final Lock streamLock = new ReentrantLock();
1154 
1155     /** The default buffer size for our buffered streams */
1156     public static final int DEFAULT_BUFFER_SIZE = 1 << 20;
1157 
1158     public AbstractFSReader(Algorithm compressAlgo, long fileSize, int minorVersion,
1159         HFileSystem hfs, Path path) throws IOException {
1160       this.compressAlgo = compressAlgo;
1161       this.fileSize = fileSize;
1162       this.minorVersion = minorVersion;
1163       this.hfs = hfs;
1164       this.path = path;
1165       this.hdrSize = headerSize(minorVersion);
1166     }
1167 
1168     @Override
1169     public BlockIterator blockRange(final long startOffset,
1170         final long endOffset) {
1171       return new BlockIterator() {
1172         private long offset = startOffset;
1173 
1174         @Override
1175         public HFileBlock nextBlock() throws IOException {
1176           if (offset >= endOffset)
1177             return null;
1178           HFileBlock b = readBlockData(offset, -1, -1, false);
1179           offset += b.getOnDiskSizeWithHeader();
1180           return b;
1181         }
1182 
1183         @Override
1184         public HFileBlock nextBlockWithBlockType(BlockType blockType)
1185             throws IOException {
1186           HFileBlock blk = nextBlock();
1187           if (blk.getBlockType() != blockType) {
1188             throw new IOException("Expected block of type " + blockType
1189                 + " but found " + blk.getBlockType());
1190           }
1191           return blk;
1192         }
1193       };
1194     }
1195 
1196     /**
1197      * Does a positional read or a seek and read into the given buffer. Returns
1198      * the on-disk size of the next block, or -1 if it could not be determined.
1199      *
1200      * @param dest destination buffer
1201      * @param destOffset offset in the destination buffer
1202      * @param size size of the block to be read
1203      * @param peekIntoNextBlock whether to read the next block's on-disk size
1204      * @param fileOffset position in the stream to read at
1205      * @param pread whether we should do a positional read
1206      * @param istream The input source of data
1207      * @return the on-disk size of the next block with header size included, or
1208      *         -1 if it could not be determined
1209      * @throws IOException
1210      */
1211     protected int readAtOffset(FSDataInputStream istream,
1212         byte[] dest, int destOffset, int size,
1213         boolean peekIntoNextBlock, long fileOffset, boolean pread)
1214         throws IOException {
1215       if (peekIntoNextBlock &&
1216           destOffset + size + hdrSize > dest.length) {
1217         // We are asked to read the next block's header as well, but there is
1218         // not enough room in the array.
1219         throw new IOException("Attempted to read " + size + " bytes and " +
1220             hdrSize + " bytes of next header into a " + dest.length +
1221             "-byte array at offset " + destOffset);
1222       }
1223 
1224       if (!pread && streamLock.tryLock()) {
1225         // Seek + read. Better for scanning.
1226         try {
1227           istream.seek(fileOffset);
1228 
1229           long realOffset = istream.getPos();
1230           if (realOffset != fileOffset) {
1231             throw new IOException("Tried to seek to " + fileOffset + " to "
1232                 + "read " + size + " bytes, but pos=" + realOffset
1233                 + " after seek");
1234           }
1235 
1236           if (!peekIntoNextBlock) {
1237             IOUtils.readFully(istream, dest, destOffset, size);
1238             return -1;
1239           }
1240 
1241           // Try to read the next block header.
1242           if (!readWithExtra(istream, dest, destOffset, size, hdrSize))
1243             return -1;
1244         } finally {
1245           streamLock.unlock();
1246         }
1247       } else {
1248         // Positional read. Better for random reads; or when the streamLock is already locked.
1249         int extraSize = peekIntoNextBlock ? hdrSize : 0;
1250 
1251         int ret = istream.read(fileOffset, dest, destOffset, size + extraSize);
1252         if (ret < size) {
1253           throw new IOException("Positional read of " + size + " bytes " +
1254               "failed at offset " + fileOffset + " (returned " + ret + ")");
1255         }
1256 
1257         if (ret == size || ret < size + extraSize) {
1258           // Could not read the next block's header, or did not try.
1259           return -1;
1260         }
1261       }
1262 
1263       assert peekIntoNextBlock;
1264       return Bytes.toInt(dest, destOffset + size + BlockType.MAGIC_LENGTH) +
1265           hdrSize;
1266     }
1267 
1268     /**
1269      * @return The minorVersion of this HFile
1270      */
1271     protected int getMinorVersion() {
1272       return minorVersion;
1273     }
1274   }
1275 
1276   /**
1277    * We always prefetch the header of the next block, so that we know its
1278    * on-disk size in advance and can read it in one operation.
1279    */
1280   private static class PrefetchedHeader {
1281     long offset = -1;
1282     byte[] header = new byte[HConstants.HFILEBLOCK_HEADER_SIZE];
1283     ByteBuffer buf = ByteBuffer.wrap(header, 0, HConstants.HFILEBLOCK_HEADER_SIZE);
1284   }
1285 
1286   /** Reads version 2 blocks from the filesystem. */
1287   static class FSReaderV2 extends AbstractFSReader {
1288     /** The file system stream of the underlying {@link HFile}, which may
1289      * or may not do checksum validation in the filesystem */
1290     protected FSDataInputStreamWrapper streamWrapper;
1291 
1292     /** Whether we include memstore timestamp in data blocks */
1293     protected boolean includesMemstoreTS;
1294 
1295     /** Data block encoding used to read from file */
1296     protected HFileDataBlockEncoder dataBlockEncoder =
1297         NoOpDataBlockEncoder.INSTANCE;
1298 
1299     private HFileBlockDecodingContext encodedBlockDecodingCtx;
1300 
1301     private HFileBlockDefaultDecodingContext defaultDecodingCtx;
1302 
1303     private ThreadLocal<PrefetchedHeader> prefetchedHeaderForThread =
1304         new ThreadLocal<PrefetchedHeader>() {
1305           @Override
1306           public PrefetchedHeader initialValue() {
1307             return new PrefetchedHeader();
1308           }
1309         };
1310 
1311     public FSReaderV2(FSDataInputStreamWrapper stream, Algorithm compressAlgo, long fileSize,
1312         int minorVersion, HFileSystem hfs, Path path) throws IOException {
1313       super(compressAlgo, fileSize, minorVersion, hfs, path);
1314       this.streamWrapper = stream;
1315       // Older versions of HBase didn't support checksum.
1316       boolean forceNoHBaseChecksum = (this.getMinorVersion() < MINOR_VERSION_WITH_CHECKSUM);
1317       this.streamWrapper.prepareForBlockReader(forceNoHBaseChecksum);
1318 
1319       defaultDecodingCtx =
1320         new HFileBlockDefaultDecodingContext(compressAlgo);
1321       encodedBlockDecodingCtx =
1322           new HFileBlockDefaultDecodingContext(compressAlgo);
1323     }
1324 
1325     /**
1326      * A constructor that reads files with the latest minor version.
1327      * This is used by unit tests only.
1328      */
1329     FSReaderV2(FSDataInputStream istream, Algorithm compressAlgo,
1330         long fileSize) throws IOException {
1331       this(new FSDataInputStreamWrapper(istream), compressAlgo, fileSize,
1332            HFileReaderV2.MAX_MINOR_VERSION, null, null);
1333     }
1334 
1335     /**
1336      * Reads a version 2 block. Tries to do as little memory allocation as
1337      * possible, using the provided on-disk size.
1338      *
1339      * @param offset the offset in the stream to read at
1340      * @param onDiskSizeWithHeaderL the on-disk size of the block, including
1341      *          the header, or -1 if unknown
1342      * @param uncompressedSize the uncompressed size of the block. Always
1343      *          expected to be -1. This parameter is only used in version 1.
1344      * @param pread whether to use a positional read
1345      */
1346     @Override
1347     public HFileBlock readBlockData(long offset, long onDiskSizeWithHeaderL,
1348         int uncompressedSize, boolean pread) throws IOException {
1349 
1350       // Get a copy of the current state of whether to validate
1351       // HBase checksums or not for this read call. This is not
1352       // thread-safe, but the one constraint is that if we decide
1353       // to skip HBase checksum verification then we are
1354       // guaranteed to use HDFS checksum verification.
1355       boolean doVerificationThruHBaseChecksum = streamWrapper.shouldUseHBaseChecksum();
1356       FSDataInputStream is = streamWrapper.getStream(doVerificationThruHBaseChecksum);
1357 
1358       HFileBlock blk = readBlockDataInternal(is, offset, 
1359                          onDiskSizeWithHeaderL, 
1360                          uncompressedSize, pread,
1361                          doVerificationThruHBaseChecksum);
1362       if (blk == null) {
1363         HFile.LOG.warn("HBase checksum verification failed for file " +
1364                        path + " at offset " +
1365                        offset + " filesize " + fileSize +
1366                        ". Retrying read with HDFS checksums turned on...");
1367 
1368         if (!doVerificationThruHBaseChecksum) {
1369           String msg = "HBase checksum verification failed for file " +
1370                        path + " at offset " +
1371                        offset + " filesize " + fileSize + 
1372                        " but this cannot happen because doVerify is " +
1373                        doVerificationThruHBaseChecksum;
1374           HFile.LOG.warn(msg);
1375           throw new IOException(msg); // cannot happen case here
1376         }
1377         HFile.checksumFailures.incrementAndGet(); // update metrics
1378 
1379         // If we have a checksum failure, we fall back into a mode where
1380         // the next few reads use HDFS level checksums. We aim to make the
1381         // next CHECKSUM_VERIFICATION_NUM_IO_THRESHOLD reads avoid
1382         // hbase checksum verification, but since this value is set without
1383         // holding any locks, it can so happen that we might actually do
1384         // a few more than precisely this number.
1385         is = this.streamWrapper.fallbackToFsChecksum(CHECKSUM_VERIFICATION_NUM_IO_THRESHOLD);
1386         doVerificationThruHBaseChecksum = false;
1387         blk = readBlockDataInternal(is, offset, onDiskSizeWithHeaderL,
1388                                     uncompressedSize, pread,
1389                                     doVerificationThruHBaseChecksum);
1390         if (blk != null) {
1391           HFile.LOG.warn("HDFS checksum verification succeeded for file " +
1392                          path + " at offset " +
1393                          offset + " filesize " + fileSize);
1394         }
1395       } 
1396       if (blk == null && !doVerificationThruHBaseChecksum) {
1397         String msg = "readBlockData failed, possibly due to a " +
1398                      "checksum verification failure for file " + path +
1399                      " at offset " + offset + " filesize " + fileSize;
1400         HFile.LOG.warn(msg);
1401         throw new IOException(msg);
1402       }
1403 
1404       // If there was a checksum mismatch earlier, then we retry with
1405       // HBase checksums switched off and use HDFS checksum verification.
1406       // This triggers HDFS to detect and fix corrupt replicas. The
1407       // next checksumOffCount read requests will use HDFS checksums.
1408       // The decrementing of the checksum-off count is not thread-safe,
1409       // but it is harmless because eventually the count will become
1410       // a negative number.
1411       streamWrapper.checksumOk();
1412       return blk;
1413     }
1414 
1415     /**
1416      * Reads a version 2 block. 
1417      *
1418      * @param offset the offset in the stream to read at
1419      * @param onDiskSizeWithHeaderL the on-disk size of the block, including
1420      *          the header, or -1 if unknown
1421      * @param uncompressedSize the uncompressed size of the block. Always
1422      *          expected to be -1. This parameter is only used in version 1.
1423      * @param pread whether to use a positional read
1424      * @param verifyChecksum Whether to use HBase checksums. 
1425      *        If HBase checksum is switched off, then use HDFS checksum.
1426      * @return the HFileBlock or null if there is an HBase checksum mismatch
1427      */
1428     private HFileBlock readBlockDataInternal(FSDataInputStream is, long offset, 
1429         long onDiskSizeWithHeaderL, int uncompressedSize, boolean pread,
1430         boolean verifyChecksum) throws IOException {
1431       if (offset < 0) {
1432         throw new IOException("Invalid offset=" + offset + " trying to read "
1433             + "block (onDiskSize=" + onDiskSizeWithHeaderL
1434             + ", uncompressedSize=" + uncompressedSize + ")");
1435       }
1436       if (uncompressedSize != -1) {
1437         throw new IOException("Version 2 block reader API does not need " +
1438             "the uncompressed size parameter");
1439       }
1440 
1441       if ((onDiskSizeWithHeaderL < hdrSize && onDiskSizeWithHeaderL != -1)
1442           || onDiskSizeWithHeaderL >= Integer.MAX_VALUE) {
1443         throw new IOException("Invalid onDiskSize=" + onDiskSizeWithHeaderL
1444             + ": expected to be at least " + hdrSize
1445             + " and at most " + Integer.MAX_VALUE + ", or -1 (offset="
1446             + offset + ", uncompressedSize=" + uncompressedSize + ")");
1447       }
1448 
1449       int onDiskSizeWithHeader = (int) onDiskSizeWithHeaderL;
1450       // See if we can avoid reading the header. This is desirable because
1451       // we will not incur a backward seek operation if we have already
1452       // read this block's header as part of the previous read's look-ahead,
1453       // and we can simply skip reading the header again if it has already
1454       // been read.
1455       PrefetchedHeader prefetchedHeader = prefetchedHeaderForThread.get();
1456       ByteBuffer headerBuf = prefetchedHeader.offset == offset ?
1457           prefetchedHeader.buf : null;
1458 
1459       int nextBlockOnDiskSize = 0;
1460       // Allocate enough space to fit the next block's header too.
1461       byte[] onDiskBlock = null;
1462 
1463       HFileBlock b = null;
1464       if (onDiskSizeWithHeader > 0) {
1465         // We know the total on-disk size but not the uncompressed size. Read
1466         // the entire block into memory, then parse the header and decompress
1467         // from memory if using compression. This code path is used when
1468         // doing a random read operation relying on the block index, as well as
1469         // when the client knows the on-disk size from peeking into the next
1470         // block's header (e.g. this block's header) when reading the previous
1471         // block. This is the faster and more preferable case.
1472 
1473         // Size that we have to skip in case we have already read the header.
1474         int preReadHeaderSize = headerBuf == null ? 0 : hdrSize;
1475         onDiskBlock = new byte[onDiskSizeWithHeader + hdrSize];
1476         nextBlockOnDiskSize = readAtOffset(is, onDiskBlock,
1477             preReadHeaderSize, onDiskSizeWithHeader - preReadHeaderSize,
1478             true, offset + preReadHeaderSize, pread);
1479         if (headerBuf != null) {
1480           // the header has been read when reading the previous block, copy
1481           // to this block's header
1482           System.arraycopy(headerBuf.array(),
1483               headerBuf.arrayOffset(), onDiskBlock, 0, hdrSize);
1484         } else {
1485           headerBuf = ByteBuffer.wrap(onDiskBlock, 0, hdrSize);
1486         }
1487         // At this point the block's header has already been read from disk
1488         // (or copied from the prefetched header), so parse it to construct
1489         // the block object. The body of the block was read into onDiskBlock
1490         // above and is decompressed later if needed.
1491         try {
1492           b = new HFileBlock(headerBuf, getMinorVersion());
1493         } catch (IOException ex) {
1494           // Seen in load testing. Provide comprehensive debug info.
1495           throw new IOException("Failed to read compressed block at "
1496               + offset
1497               + ", onDiskSizeWithHeader="
1498               + onDiskSizeWithHeader
1499               + ", preReadHeaderSize="
1500               + preReadHeaderSize
1501               + ", header.length="
1502               + prefetchedHeader.header.length
1503               + ", header bytes: "
1504               + Bytes.toStringBinary(prefetchedHeader.header, 0,
1505                   hdrSize), ex);
1506         }
1507         // If the caller specifies an onDiskSizeWithHeader, validate it.
1508         int onDiskSizeWithoutHeader = onDiskSizeWithHeader - hdrSize;
1509         assert onDiskSizeWithoutHeader >= 0;
1510         b.validateOnDiskSizeWithoutHeader(onDiskSizeWithoutHeader);
1511       } else {
1512         // Check headerBuf to see if we have read this block's header as part of
1513         // reading the previous block. This is an optimization of peeking into
1514         // the next block's header (e.g. this block's header) when reading the
1515         // previous block. This is the faster and more preferable case. If the
1516         // header is already there, don't read the header again.
1517 
1518         // Unfortunately, we still have to do a separate read operation to
1519         // read the header.
1520         if (headerBuf == null) {
1521           // From the header, determine the on-disk size of the given hfile
1522           // block, and read the remaining data, thereby incurring two read
1523           // operations. This might happen when we are doing the first read
1524           // in a series of reads or a random read, and we don't have access
1525           // to the block index. This is costly and should happen very rarely.
1526           headerBuf = ByteBuffer.allocate(hdrSize);
1527           readAtOffset(is, headerBuf.array(), headerBuf.arrayOffset(),
1528               hdrSize, false, offset, pread);
1529         }
1530 
1531         b = new HFileBlock(headerBuf, getMinorVersion());
1532         onDiskBlock = new byte[b.getOnDiskSizeWithHeader() + hdrSize];
1533         System.arraycopy(headerBuf.array(),
1534               headerBuf.arrayOffset(), onDiskBlock, 0, hdrSize);
1535         nextBlockOnDiskSize =
1536           readAtOffset(is, onDiskBlock, hdrSize, b.getOnDiskSizeWithHeader()
1537               - hdrSize, true, offset + hdrSize, pread);
1538         onDiskSizeWithHeader = b.onDiskSizeWithoutHeader + hdrSize;
1539       }
1540 
1541       boolean isCompressed =
1542         compressAlgo != null
1543             && compressAlgo != Compression.Algorithm.NONE;
1544       if (!isCompressed) {
1545         b.assumeUncompressed();
1546       }
1547 
1548       if (verifyChecksum &&
1549           !validateBlockChecksum(b, onDiskBlock, hdrSize)) {
1550         return null;             // checksum mismatch
1551       }
1552 
1553       if (isCompressed) {
1554         // This will allocate a new buffer but keep header bytes.
1555         b.allocateBuffer(nextBlockOnDiskSize > 0);
1556         if (b.blockType == BlockType.ENCODED_DATA) {
1557           encodedBlockDecodingCtx.prepareDecoding(b.getOnDiskSizeWithoutHeader(),
1558               b.getUncompressedSizeWithoutHeader(), b.getBufferWithoutHeader(), onDiskBlock,
1559               hdrSize);
1560         } else {
1561           defaultDecodingCtx.prepareDecoding(b.getOnDiskSizeWithoutHeader(),
1562               b.getUncompressedSizeWithoutHeader(), b.getBufferWithoutHeader(), onDiskBlock,
1563               hdrSize);
1564         }
1565         if (nextBlockOnDiskSize > 0) {
1566           // Copy next block's header bytes into the new block if we have them.
1567           System.arraycopy(onDiskBlock, onDiskSizeWithHeader, b.buf.array(),
1568               b.buf.arrayOffset() + hdrSize
1569               + b.uncompressedSizeWithoutHeader + b.totalChecksumBytes(),
1570               hdrSize);
1571         }
1572       } else {
1573         // The onDiskBlock will become the headerAndDataBuffer for this block.
1574         // If nextBlockOnDiskSizeWithHeader is not zero, the onDiskBlock already
1575         // contains the header of the next block, so there is no need to set
1576         // the next block's header in it.
1577         b = new HFileBlock(ByteBuffer.wrap(onDiskBlock, 0,
1578                 onDiskSizeWithHeader), getMinorVersion());
1579       }
1580 
1581       b.nextBlockOnDiskSizeWithHeader = nextBlockOnDiskSize;
1582 
1583       // Set prefetched header
1584       if (b.nextBlockOnDiskSizeWithHeader > 0) {
1585         prefetchedHeader.offset = offset + b.getOnDiskSizeWithHeader();
1586         System.arraycopy(onDiskBlock, onDiskSizeWithHeader,
1587             prefetchedHeader.header, 0, hdrSize);
1588       }
1589 
1590       b.includesMemstoreTS = includesMemstoreTS;
1591       b.offset = offset;
1592       return b;
1593     }
1594 
1595     void setIncludesMemstoreTS(boolean enabled) {
1596       includesMemstoreTS = enabled;
1597     }
1598 
1599     void setDataBlockEncoder(HFileDataBlockEncoder encoder) {
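           // Encoded data blocks are decoded with a context supplied by the encoder;
           // all other blocks keep using defaultDecodingCtx (see readBlockDataInternal).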
1600       this.dataBlockEncoder = encoder;
1601       encodedBlockDecodingCtx = encoder.newDataBlockDecodingContext(
1602           this.compressAlgo);
1603     }
1604 
1605     /**
1606      * Generates the checksum for the header as well as the data and
1607      * then validates it against the checksum values stored in the block.
1608      * If there is a checksum mismatch, returns false; otherwise
1609      * returns true.
1610      */
1611     protected boolean validateBlockChecksum(HFileBlock block, 
1612       byte[] data, int hdrSize) throws IOException {
1613       return ChecksumUtil.validateBlockChecksum(path, block,
1614                                                 data, hdrSize);
1615     }
1616 
1617     @Override
1618     public void closeStreams() throws IOException {
1619       streamWrapper.close();
1620     }
1621   }
1622 
1623   @Override
1624   public int getSerializedLength() {
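         // The serialized form is the block's backing buffer plus the extra
         // fields written by serialize() below (minor version, offset, and the
         // next block's on-disk size).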
1625     if (buf != null) {
1626       return this.buf.limit() + HFileBlock.EXTRA_SERIALIZATION_SPACE;
1627     }
1628     return 0;
1629   }
1630 
1631   @Override
1632   public void serialize(ByteBuffer destination) {
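         // Copy the block's backing buffer, then append the metadata needed to
         // reconstruct the block on deserialization: minor version, file offset,
         // and the next block's on-disk size (with header).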
1633     ByteBuffer dupBuf = this.buf.duplicate();
1634     dupBuf.rewind();
1635     destination.put(dupBuf);
1636     destination.putInt(this.minorVersion);
1637     destination.putLong(this.offset);
1638     destination.putInt(this.nextBlockOnDiskSizeWithHeader);
1639     destination.rewind();
1640   }
1641 
1642   public void serializeExtraInfo(ByteBuffer destination) {
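         // Writes only the trailing metadata fields, without the block data;
         // intended for callers that persist the data buffer separately.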
1643     destination.putInt(this.minorVersion);
1644     destination.putLong(this.offset);
1645     destination.putInt(this.nextBlockOnDiskSizeWithHeader);
1646     destination.rewind();
1647   }
1648 
1649   @Override
1650   public CacheableDeserializer<Cacheable> getDeserializer() {
1651     return HFileBlock.blockDeserializer;
1652   }
1653 
1654   @Override
1655   public boolean equals(Object comparison) {
1656     if (this == comparison) {
1657       return true;
1658     }
1659     if (comparison == null) {
1660       return false;
1661     }
1662     if (comparison.getClass() != this.getClass()) {
1663       return false;
1664     }
1665 
1666     HFileBlock castedComparison = (HFileBlock) comparison;
1667 
1668     if (castedComparison.blockType != this.blockType) {
1669       return false;
1670     }
1671     if (castedComparison.nextBlockOnDiskSizeWithHeader != this.nextBlockOnDiskSizeWithHeader) {
1672       return false;
1673     }
1674     if (castedComparison.offset != this.offset) {
1675       return false;
1676     }
1677     if (castedComparison.onDiskSizeWithoutHeader != this.onDiskSizeWithoutHeader) {
1678       return false;
1679     }
1680     if (castedComparison.prevBlockOffset != this.prevBlockOffset) {
1681       return false;
1682     }
1683     if (castedComparison.uncompressedSizeWithoutHeader != this.uncompressedSizeWithoutHeader) {
1684       return false;
1685     }
1686     if (this.buf.compareTo(castedComparison.buf) != 0) {
1687       return false;
1688     }
1689     if (this.buf.position() != castedComparison.buf.position()){
1690       return false;
1691     }
1692     if (this.buf.limit() != castedComparison.buf.limit()){
1693       return false;
1694     }
1695     return true;
1696   }
1697 
1698   public boolean doesIncludeMemstoreTS() {
1699     return includesMemstoreTS;
1700   }
1701 
1702   public DataBlockEncoding getDataBlockEncoding() {
1703     if (blockType == BlockType.ENCODED_DATA) {
1704       return DataBlockEncoding.getEncodingById(getDataBlockEncodingId());
1705     }
1706     return DataBlockEncoding.NONE;
1707   }
1708 
1709   byte getChecksumType() {
1710     return this.checksumType;
1711   }
1712 
1713   int getBytesPerChecksum() {
1714     return this.bytesPerChecksum;
1715   }
1716 
1717   int getOnDiskDataSizeWithHeader() {
1718     return this.onDiskDataSizeWithHeader;
1719   }
1720 
1721   int getMinorVersion() {
1722     return this.minorVersion;
1723   }
1724 
1725   /** 
1726    * Calculate the number of bytes required to store all the checksums
1727    * for this block. Each checksum value is a 4-byte integer.
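        * For example, assuming one checksum chunk per bytesPerChecksum bytes, a
        * block with 64 KB of on-disk data and a 16 KB checksum chunk size would
        * store four 4-byte checksums, i.e. 16 bytes in total.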
1728    */
1729   int totalChecksumBytes() {
1730     // If the hfile block has minorVersion 0, then there is no checksum
1731     // data to validate. Similarly, a zero value in this.bytesPerChecksum
1732     // indicates that cached blocks do not have checksum data because
1733     // checksums were already validated when the block was read from disk.
1734     if (minorVersion < MINOR_VERSION_WITH_CHECKSUM || this.bytesPerChecksum == 0) {
1735       return 0;
1736     }
1737     return (int)ChecksumUtil.numBytes(onDiskDataSizeWithHeader, bytesPerChecksum);
1738   }
1739 
1740   /**
1741    * Returns the size of this block header.
1742    */
1743   public int headerSize() {
1744     return headerSize(this.minorVersion);
1745   }
1746 
1747   /**
1748    * Maps a minor version to the size of the header.
1749    */
1750   public static int headerSize(int minorVersion) {
1751     if (minorVersion < MINOR_VERSION_WITH_CHECKSUM) {
1752       return HConstants.HFILEBLOCK_HEADER_SIZE_NO_CHECKSUM;
1753     }
1754     return HConstants.HFILEBLOCK_HEADER_SIZE;
1755   }
1756 
1757   /**
1758    * Return the appropriate DUMMY_HEADER for the minor version
1759    */
1760   public byte[] getDummyHeaderForVersion() {
1761     return getDummyHeaderForVersion(minorVersion);
1762   }
1763 
1764   /**
1765    * Return the appropriate DUMMY_HEADER for the minor version
1766    */
1767   private static byte[] getDummyHeaderForVersion(int minorVersion) {
1768     if (minorVersion < MINOR_VERSION_WITH_CHECKSUM) {
1769       return DUMMY_HEADER_NO_CHECKSUM;
1770     }
1771     return HConstants.HFILEBLOCK_DUMMY_HEADER;
1772   }
1773 
1774   /**
1775    * Convert the contents of the block header into a human readable string.
1776    * This is mostly helpful for debugging. This assumes that the block
1777    * has minor version > 0.
1778    */
1779   static String toStringHeader(ByteBuffer buf) throws IOException {
1780     int offset = buf.arrayOffset();
1781     byte[] b = buf.array();
1782     long magic = Bytes.toLong(b, offset); 
1783     BlockType bt = BlockType.read(buf);
1784     offset += Bytes.SIZEOF_LONG;
1785     int compressedBlockSizeNoHeader = Bytes.toInt(b, offset);
1786     offset += Bytes.SIZEOF_INT;
1787     int uncompressedBlockSizeNoHeader = Bytes.toInt(b, offset);
1788     offset += Bytes.SIZEOF_INT;
1789     long prevBlockOffset = Bytes.toLong(b, offset); 
1790     offset += Bytes.SIZEOF_LONG;
1791     byte cksumtype = b[offset];
1792     offset += Bytes.SIZEOF_BYTE;
1793     long bytesPerChecksum = Bytes.toInt(b, offset); 
1794     offset += Bytes.SIZEOF_INT;
1795     long onDiskDataSizeWithHeader = Bytes.toInt(b, offset); 
1796     offset += Bytes.SIZEOF_INT;
1797     return " Header dump: magic: " + magic +
1798                    " blockType " + bt +
1799                    " compressedBlockSizeNoHeader " + 
1800                    compressedBlockSizeNoHeader +
1801                    " uncompressedBlockSizeNoHeader " + 
1802                    uncompressedBlockSizeNoHeader +
1803                    " prevBlockOffset " + prevBlockOffset +
1804                    " checksumType " + ChecksumType.codeToType(cksumtype) +
1805                    " bytesPerChecksum " + bytesPerChecksum +
1806                    " onDiskDataSizeWithHeader " + onDiskDataSizeWithHeader;
1807   }
1808 }
1809