View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.io.hfile;
19  
20  import java.io.BufferedInputStream;
21  import java.io.ByteArrayInputStream;
22  import java.io.ByteArrayOutputStream;
23  import java.io.DataInputStream;
24  import java.io.DataOutput;
25  import java.io.DataOutputStream;
26  import java.io.IOException;
27  import java.io.InputStream;
28  import java.nio.ByteBuffer;
29  import java.util.concurrent.locks.Lock;
30  import java.util.concurrent.locks.ReentrantLock;
31  
32  import org.apache.hadoop.classification.InterfaceAudience;
33  import org.apache.hadoop.fs.FSDataInputStream;
34  import org.apache.hadoop.fs.FSDataOutputStream;
35  import org.apache.hadoop.fs.Path;
36  import org.apache.hadoop.hbase.HConstants;
37  import org.apache.hadoop.hbase.fs.HFileSystem;
38  import org.apache.hadoop.hbase.io.compress.Compression;
39  import org.apache.hadoop.hbase.io.compress.Compression.Algorithm;
40  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
41  import org.apache.hadoop.hbase.io.encoding.HFileBlockDecodingContext;
42  import org.apache.hadoop.hbase.io.encoding.HFileBlockDefaultDecodingContext;
43  import org.apache.hadoop.hbase.io.encoding.HFileBlockDefaultEncodingContext;
44  import org.apache.hadoop.hbase.io.encoding.HFileBlockEncodingContext;
45  import org.apache.hadoop.hbase.io.hfile.bucket.BucketCache;
46  import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper;
47  import org.apache.hadoop.hbase.util.Bytes;
48  import org.apache.hadoop.hbase.util.ChecksumType;
49  import org.apache.hadoop.hbase.util.ClassSize;
50  import org.apache.hadoop.hbase.util.CompoundBloomFilter;
51  import org.apache.hadoop.io.IOUtils;
52  
53  import com.google.common.base.Preconditions;
54  
55  /**
56   * Reading {@link HFile} version 1 and 2 blocks, and writing version 2 blocks.
57   * <ul>
58   * <li>In version 1 all blocks are always compressed or uncompressed, as
59   * specified by the {@link HFile}'s compression algorithm, with a type-specific
60   * magic record stored in the beginning of the compressed data (i.e. one needs
61   * to uncompress the compressed block to determine the block type). There is
62   * only a single compression algorithm setting for all blocks. Offset and size
63   * information from the block index are required to read a block.
64   * <li>In version 2 a block is structured as follows:
65   * <ul>
66   * <li>Magic record identifying the block type (8 bytes)
67   * <li>Compressed block size, header not included (4 bytes)
68   * <li>Uncompressed block size, header not included (4 bytes)
69   * <li>The offset of the previous block of the same type (8 bytes). This is
70   * used to be able to navigate to the previous block without going to the block
71   * <li>For minorVersions >=1, there is an additional 4 byte field 
72   * bytesPerChecksum that records the number of bytes in a checksum chunk.
73   * <li>For minorVersions >=1, there is a 4 byte value to store the size of
74   * data on disk (excluding the checksums)
75   * <li>For minorVersions >=1, a series of 4 byte checksums, one each for
76   * the number of bytes specified by bytesPerChecksum.
77   * index.
78   * <li>Compressed data (or uncompressed data if compression is disabled). The
79   * compression algorithm is the same for all the blocks in the {@link HFile},
80   * similarly to what was done in version 1.
81   * </ul>
82   * </ul>
83   * The version 2 block representation in the block cache is the same as above,
84   * except that the data section is always uncompressed in the cache.
85   */
86  @InterfaceAudience.Private
87  public class HFileBlock implements Cacheable {
88  
89    /** Minor versions starting with this number have hbase checksums */
90    static final int MINOR_VERSION_WITH_CHECKSUM = 1;
91  
92    /** minor version that does not support checksums */
93    static final int MINOR_VERSION_NO_CHECKSUM = 0;
94  
95    /**
96     * On a checksum failure on a Reader, these many suceeding read
97     * requests switch back to using hdfs checksums before auto-reenabling
98     * hbase checksum verification.
99     */
100   static final int CHECKSUM_VERIFICATION_NUM_IO_THRESHOLD = 3;
101 
102   public static final boolean FILL_HEADER = true;
103   public static final boolean DONT_FILL_HEADER = false;
104 
105   /**
106    * The size of block header when blockType is {@link BlockType#ENCODED_DATA}.
107    * This extends normal header by adding the id of encoder.
108    */
109   public static final int ENCODED_HEADER_SIZE = HConstants.HFILEBLOCK_HEADER_SIZE
110       + DataBlockEncoding.ID_SIZE;
111 
112   static final byte[] DUMMY_HEADER_NO_CHECKSUM =
113      new byte[HConstants.HFILEBLOCK_HEADER_SIZE_NO_CHECKSUM];
114 
115   public static final int BYTE_BUFFER_HEAP_SIZE = (int) ClassSize.estimateBase(
116       ByteBuffer.wrap(new byte[0], 0, 0).getClass(), false);
117 
118   // minorVersion+offset+nextBlockOnDiskSizeWithHeader
119   public static final int EXTRA_SERIALIZATION_SPACE = 2 * Bytes.SIZEOF_INT
120       + Bytes.SIZEOF_LONG;
121 
122   /**
123    * Each checksum value is an integer that can be stored in 4 bytes.
124    */
125   static final int CHECKSUM_SIZE = Bytes.SIZEOF_INT;
126 
127   private static final CacheableDeserializer<Cacheable> blockDeserializer =
128       new CacheableDeserializer<Cacheable>() {
129         public HFileBlock deserialize(ByteBuffer buf, boolean reuse) throws IOException{
130           buf.limit(buf.limit() - HFileBlock.EXTRA_SERIALIZATION_SPACE).rewind();
131           ByteBuffer newByteBuffer;
132           if (reuse) {
133             newByteBuffer = buf.slice();
134           } else {
135            newByteBuffer = ByteBuffer.allocate(buf.limit());
136            newByteBuffer.put(buf);
137           }
138           buf.position(buf.limit());
139           buf.limit(buf.limit() + HFileBlock.EXTRA_SERIALIZATION_SPACE);
140           int minorVersion=buf.getInt();
141           HFileBlock ourBuffer = new HFileBlock(newByteBuffer, minorVersion);
142           ourBuffer.offset = buf.getLong();
143           ourBuffer.nextBlockOnDiskSizeWithHeader = buf.getInt();
144           return ourBuffer;
145         }
146         
147         @Override
148         public int getDeserialiserIdentifier() {
149           return deserializerIdentifier;
150         }
151 
152         @Override
153         public HFileBlock deserialize(ByteBuffer b) throws IOException {
154           return deserialize(b, false);
155         }
156       };
157   private static final int deserializerIdentifier;
158   static {
159     deserializerIdentifier = CacheableDeserializerIdManager
160         .registerDeserializer(blockDeserializer);
161   }
162 
163   private BlockType blockType;
164 
165   /** Size on disk without the header. It includes checksum data too. */
166   private int onDiskSizeWithoutHeader;
167 
168   /** Size of pure data. Does not include header or checksums */
169   private final int uncompressedSizeWithoutHeader;
170 
171   /** The offset of the previous block on disk */
172   private final long prevBlockOffset;
173 
174   /** The Type of checksum, better to store the byte than an object */
175   private final byte checksumType;
176 
177   /** The number of bytes for which a checksum is computed */
178   private final int bytesPerChecksum;
179 
180   /** Size on disk of header and data. Does not include checksum data */
181   private final int onDiskDataSizeWithHeader;
182 
183   /** The minor version of the hfile. */
184   private final int minorVersion;
185 
186   /** The in-memory representation of the hfile block */
187   private ByteBuffer buf;
188 
189   /** Whether there is a memstore timestamp after every key/value */
190   private boolean includesMemstoreTS;
191 
192   /**
193    * The offset of this block in the file. Populated by the reader for
194    * convenience of access. This offset is not part of the block header.
195    */
196   private long offset = -1;
197 
198   /**
199    * The on-disk size of the next block, including the header, obtained by
200    * peeking into the first {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes of the next block's
201    * header, or -1 if unknown.
202    */
203   private int nextBlockOnDiskSizeWithHeader = -1;
204 
205   /**
206    * Creates a new {@link HFile} block from the given fields. This constructor
207    * is mostly used when the block data has already been read and uncompressed,
208    * and is sitting in a byte buffer. 
209    *
210    * @param blockType the type of this block, see {@link BlockType}
211    * @param onDiskSizeWithoutHeader compressed size of the block if compression
212    *          is used, otherwise uncompressed size, header size not included
213    * @param uncompressedSizeWithoutHeader uncompressed size of the block,
214    *          header size not included. Equals onDiskSizeWithoutHeader if
215    *          compression is disabled.
216    * @param prevBlockOffset the offset of the previous block in the
217    *          {@link HFile}
218    * @param buf block header ({@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes) followed by
219    *          uncompressed data. This
220    * @param fillHeader true to fill in the first {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes of
221    *          the buffer based on the header fields provided
222    * @param offset the file offset the block was read from
223    * @param minorVersion the minor version of this block
224    * @param bytesPerChecksum the number of bytes per checksum chunk
225    * @param checksumType the checksum algorithm to use
226    * @param onDiskDataSizeWithHeader size of header and data on disk not
227    *        including checksum data
228    */
229   HFileBlock(BlockType blockType, int onDiskSizeWithoutHeader,
230       int uncompressedSizeWithoutHeader, long prevBlockOffset, ByteBuffer buf,
231       boolean fillHeader, long offset, boolean includesMemstoreTS, 
232       int minorVersion, int bytesPerChecksum, byte checksumType,
233       int onDiskDataSizeWithHeader) {
234     this.blockType = blockType;
235     this.onDiskSizeWithoutHeader = onDiskSizeWithoutHeader;
236     this.uncompressedSizeWithoutHeader = uncompressedSizeWithoutHeader;
237     this.prevBlockOffset = prevBlockOffset;
238     this.buf = buf;
239     if (fillHeader)
240       overwriteHeader();
241     this.offset = offset;
242     this.includesMemstoreTS = includesMemstoreTS;
243     this.minorVersion = minorVersion;
244     this.bytesPerChecksum = bytesPerChecksum;
245     this.checksumType = checksumType;
246     this.onDiskDataSizeWithHeader = onDiskDataSizeWithHeader;
247   }
248 
249   /**
250    * Creates a block from an existing buffer starting with a header. Rewinds
251    * and takes ownership of the buffer. By definition of rewind, ignores the
252    * buffer position, but if you slice the buffer beforehand, it will rewind
253    * to that point. The reason this has a minorNumber and not a majorNumber is
254    * because majorNumbers indicate the format of a HFile whereas minorNumbers 
255    * indicate the format inside a HFileBlock.
256    */
257   HFileBlock(ByteBuffer b, int minorVersion) throws IOException {
258     b.rewind();
259     blockType = BlockType.read(b);
260     onDiskSizeWithoutHeader = b.getInt();
261     uncompressedSizeWithoutHeader = b.getInt();
262     prevBlockOffset = b.getLong();
263     this.minorVersion = minorVersion;
264     if (minorVersion >= MINOR_VERSION_WITH_CHECKSUM) {
265       this.checksumType = b.get();
266       this.bytesPerChecksum = b.getInt();
267       this.onDiskDataSizeWithHeader = b.getInt();
268     } else {
269       this.checksumType = ChecksumType.NULL.getCode();
270       this.bytesPerChecksum = 0;
271       this.onDiskDataSizeWithHeader = onDiskSizeWithoutHeader +
272                                        HConstants.HFILEBLOCK_HEADER_SIZE_NO_CHECKSUM;
273     }
274     buf = b;
275     buf.rewind();
276   }
277 
278   public BlockType getBlockType() {
279     return blockType;
280   }
281 
282   /** @return get data block encoding id that was used to encode this block */
283   public short getDataBlockEncodingId() {
284     if (blockType != BlockType.ENCODED_DATA) {
285       throw new IllegalArgumentException("Querying encoder ID of a block " +
286           "of type other than " + BlockType.ENCODED_DATA + ": " + blockType);
287     }
288     return buf.getShort(headerSize());
289   }
290 
291   /**
292    * @return the on-disk size of the block with header size included. This
293    * includes the header, the data and the checksum data.
294    */
295   public int getOnDiskSizeWithHeader() {
296     return onDiskSizeWithoutHeader + headerSize();
297   }
298 
299   /**
300    * Returns the size of the compressed part of the block in case compression
301    * is used, or the uncompressed size of the data part otherwise. Header size
302    * and checksum data size is not included.
303    *
304    * @return the on-disk size of the data part of the block, header and
305    *         checksum not included. 
306    */
307   public int getOnDiskSizeWithoutHeader() {
308     return onDiskSizeWithoutHeader;
309   }
310 
311   /**
312    * @return the uncompressed size of the data part of the block, header not
313    *         included
314    */
315    public int getUncompressedSizeWithoutHeader() {
316     return uncompressedSizeWithoutHeader;
317   }
318 
319   /**
320    * @return the offset of the previous block of the same type in the file, or
321    *         -1 if unknown
322    */
323   public long getPrevBlockOffset() {
324     return prevBlockOffset;
325   }
326 
327   /**
328    * Writes header fields into the first {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes of the
329    * buffer. Resets the buffer position to the end of header as side effect.
330    */
331   private void overwriteHeader() {
332     buf.rewind();
333     blockType.write(buf);
334     buf.putInt(onDiskSizeWithoutHeader);
335     buf.putInt(uncompressedSizeWithoutHeader);
336     buf.putLong(prevBlockOffset);
337   }
338 
339   /**
340    * Returns a buffer that does not include the header. The array offset points
341    * to the start of the block data right after the header. The underlying data
342    * array is not copied. Checksum data is not included in the returned buffer.
343    *
344    * @return the buffer with header skipped
345    */
346   public ByteBuffer getBufferWithoutHeader() {
347     return ByteBuffer.wrap(buf.array(), buf.arrayOffset() + headerSize(),
348         buf.limit() - headerSize() - totalChecksumBytes()).slice();
349   }
350 
351   /**
352    * Returns the buffer this block stores internally. The clients must not
353    * modify the buffer object. This method has to be public because it is
354    * used in {@link CompoundBloomFilter} to avoid object creation on every
355    * Bloom filter lookup, but has to be used with caution. Checksum data
356    * is not included in the returned buffer.
357    *
358    * @return the buffer of this block for read-only operations
359    */
360   public ByteBuffer getBufferReadOnly() {
361     return ByteBuffer.wrap(buf.array(), buf.arrayOffset(),
362         buf.limit() - totalChecksumBytes()).slice();
363   }
364 
365   /**
366    * Returns the buffer of this block, including header data. The clients must
367    * not modify the buffer object. This method has to be public because it is
368    * used in {@link BucketCache} to avoid buffer copy.
369    * 
370    * @return the byte buffer with header included for read-only operations
371    */
372   public ByteBuffer getBufferReadOnlyWithHeader() {
373     return ByteBuffer.wrap(buf.array(), buf.arrayOffset(), buf.limit()).slice();
374   }
375 
376   /**
377    * Returns a byte buffer of this block, including header data, positioned at
378    * the beginning of header. The underlying data array is not copied.
379    *
380    * @return the byte buffer with header included
381    */
382   ByteBuffer getBufferWithHeader() {
383     ByteBuffer dupBuf = buf.duplicate();
384     dupBuf.rewind();
385     return dupBuf;
386   }
387 
388   private void sanityCheckAssertion(long valueFromBuf, long valueFromField,
389       String fieldName) throws IOException {
390     if (valueFromBuf != valueFromField) {
391       throw new AssertionError(fieldName + " in the buffer (" + valueFromBuf
392           + ") is different from that in the field (" + valueFromField + ")");
393     }
394   }
395 
396   /**
397    * Checks if the block is internally consistent, i.e. the first
398    * {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes of the buffer contain a valid header consistent
399    * with the fields. This function is primary for testing and debugging, and
400    * is not thread-safe, because it alters the internal buffer pointer.
401    */
402   void sanityCheck() throws IOException {
403     buf.rewind();
404 
405     {
406       BlockType blockTypeFromBuf = BlockType.read(buf);
407       if (blockTypeFromBuf != blockType) {
408         throw new IOException("Block type stored in the buffer: " +
409             blockTypeFromBuf + ", block type field: " + blockType);
410       }
411     }
412 
413     sanityCheckAssertion(buf.getInt(), onDiskSizeWithoutHeader,
414         "onDiskSizeWithoutHeader");
415 
416     sanityCheckAssertion(buf.getInt(), uncompressedSizeWithoutHeader,
417         "uncompressedSizeWithoutHeader");
418 
419     sanityCheckAssertion(buf.getLong(), prevBlockOffset, "prevBlocKOffset");
420     if (minorVersion >= MINOR_VERSION_WITH_CHECKSUM) {
421       sanityCheckAssertion(buf.get(), checksumType, "checksumType");
422       sanityCheckAssertion(buf.getInt(), bytesPerChecksum, "bytesPerChecksum");
423       sanityCheckAssertion(buf.getInt(), onDiskDataSizeWithHeader, 
424                            "onDiskDataSizeWithHeader");
425     }
426 
427     int cksumBytes = totalChecksumBytes();
428     int hdrSize = headerSize();
429     int expectedBufLimit = uncompressedSizeWithoutHeader + headerSize() +
430                            cksumBytes;
431     if (buf.limit() != expectedBufLimit) {
432       throw new AssertionError("Expected buffer limit " + expectedBufLimit
433           + ", got " + buf.limit());
434     }
435 
436     // We might optionally allocate HFILEBLOCK_HEADER_SIZE more bytes to read the next
437     // block's, header, so there are two sensible values for buffer capacity.
438     int size = uncompressedSizeWithoutHeader + hdrSize + cksumBytes;
439     if (buf.capacity() != size &&
440         buf.capacity() != size + hdrSize) {
441       throw new AssertionError("Invalid buffer capacity: " + buf.capacity() +
442           ", expected " + size + " or " + (size + hdrSize));
443     }
444   }
445 
446   @Override
447   public String toString() {
448     return "blockType="
449         + blockType
450         + ", onDiskSizeWithoutHeader="
451         + onDiskSizeWithoutHeader
452         + ", uncompressedSizeWithoutHeader="
453         + uncompressedSizeWithoutHeader
454         + ", prevBlockOffset="
455         + prevBlockOffset
456         + ", dataBeginsWith="
457         + Bytes.toStringBinary(buf.array(), buf.arrayOffset() + headerSize(),
458             Math.min(32, buf.limit() - buf.arrayOffset() - headerSize()))
459         + ", fileOffset=" + offset;
460   }
461 
462   private void validateOnDiskSizeWithoutHeader(
463       int expectedOnDiskSizeWithoutHeader) throws IOException {
464     if (onDiskSizeWithoutHeader != expectedOnDiskSizeWithoutHeader) {
465       String blockInfoMsg =
466         "Block offset: " + offset + ", data starts with: "
467           + Bytes.toStringBinary(buf.array(), buf.arrayOffset(),
468               buf.arrayOffset() + Math.min(32, buf.limit()));
469       throw new IOException("On-disk size without header provided is "
470           + expectedOnDiskSizeWithoutHeader + ", but block "
471           + "header contains " + onDiskSizeWithoutHeader + ". " +
472           blockInfoMsg);
473     }
474   }
475 
476   /**
477    * Always allocates a new buffer of the correct size. Copies header bytes
478    * from the existing buffer. Does not change header fields. 
479    * Reserve room to keep checksum bytes too.
480    *
481    * @param extraBytes whether to reserve room in the buffer to read the next
482    *          block's header
483    */
484   private void allocateBuffer(boolean extraBytes) {
485     int cksumBytes = totalChecksumBytes();
486     int capacityNeeded = headerSize() + uncompressedSizeWithoutHeader +
487         cksumBytes +
488         (extraBytes ? headerSize() : 0);
489 
490     ByteBuffer newBuf = ByteBuffer.allocate(capacityNeeded);
491 
492     // Copy header bytes.
493     System.arraycopy(buf.array(), buf.arrayOffset(), newBuf.array(),
494         newBuf.arrayOffset(), headerSize());
495 
496     buf = newBuf;
497     buf.limit(headerSize() + uncompressedSizeWithoutHeader + cksumBytes);
498   }
499 
500   /** An additional sanity-check in case no compression is being used. */
501   public void assumeUncompressed() throws IOException {
502     if (onDiskSizeWithoutHeader != uncompressedSizeWithoutHeader + 
503         totalChecksumBytes()) {
504       throw new IOException("Using no compression but "
505           + "onDiskSizeWithoutHeader=" + onDiskSizeWithoutHeader + ", "
506           + "uncompressedSizeWithoutHeader=" + uncompressedSizeWithoutHeader
507           + ", numChecksumbytes=" + totalChecksumBytes());
508     }
509   }
510 
511   /**
512    * @param expectedType the expected type of this block
513    * @throws IOException if this block's type is different than expected
514    */
515   public void expectType(BlockType expectedType) throws IOException {
516     if (blockType != expectedType) {
517       throw new IOException("Invalid block type: expected=" + expectedType
518           + ", actual=" + blockType);
519     }
520   }
521 
522   /** @return the offset of this block in the file it was read from */
523   public long getOffset() {
524     if (offset < 0) {
525       throw new IllegalStateException(
526           "HFile block offset not initialized properly");
527     }
528     return offset;
529   }
530 
531   /**
532    * @return a byte stream reading the data section of this block
533    */
534   public DataInputStream getByteStream() {
535     return new DataInputStream(new ByteArrayInputStream(buf.array(),
536         buf.arrayOffset() + headerSize(), buf.limit() - headerSize()));
537   }
538 
539   @Override
540   public long heapSize() {
541     long size = ClassSize.align(
542         ClassSize.OBJECT +
543         // Block type and byte buffer references
544         2 * ClassSize.REFERENCE +
545         // On-disk size, uncompressed size, and next block's on-disk size
546         // bytePerChecksum,  onDiskDataSize and minorVersion
547         6 * Bytes.SIZEOF_INT +
548         // Checksum type
549         1 * Bytes.SIZEOF_BYTE +
550         // This and previous block offset
551         2 * Bytes.SIZEOF_LONG +
552         // "Include memstore timestamp" flag
553         Bytes.SIZEOF_BOOLEAN
554     );
555 
556     if (buf != null) {
557       // Deep overhead of the byte buffer. Needs to be aligned separately.
558       size += ClassSize.align(buf.capacity() + BYTE_BUFFER_HEAP_SIZE);
559     }
560 
561     return ClassSize.align(size);
562   }
563 
564   /**
565    * Read from an input stream. Analogous to
566    * {@link IOUtils#readFully(InputStream, byte[], int, int)}, but specifies a
567    * number of "extra" bytes that would be desirable but not absolutely
568    * necessary to read.
569    *
570    * @param in the input stream to read from
571    * @param buf the buffer to read into
572    * @param bufOffset the destination offset in the buffer
573    * @param necessaryLen the number of bytes that are absolutely necessary to
574    *          read
575    * @param extraLen the number of extra bytes that would be nice to read
576    * @return true if succeeded reading the extra bytes
577    * @throws IOException if failed to read the necessary bytes
578    */
579   public static boolean readWithExtra(InputStream in, byte buf[],
580       int bufOffset, int necessaryLen, int extraLen) throws IOException {
581     int bytesRemaining = necessaryLen + extraLen;
582     while (bytesRemaining > 0) {
583       int ret = in.read(buf, bufOffset, bytesRemaining);
584       if (ret == -1 && bytesRemaining <= extraLen) {
585         // We could not read the "extra data", but that is OK.
586         break;
587       }
588 
589       if (ret < 0) {
590         throw new IOException("Premature EOF from inputStream (read "
591             + "returned " + ret + ", was trying to read " + necessaryLen
592             + " necessary bytes and " + extraLen + " extra bytes, "
593             + "successfully read "
594             + (necessaryLen + extraLen - bytesRemaining));
595       }
596       bufOffset += ret;
597       bytesRemaining -= ret;
598     }
599     return bytesRemaining <= 0;
600   }
601 
602   /**
603    * @return the on-disk size of the next block (including the header size)
604    *         that was read by peeking into the next block's header
605    */
606   public int getNextBlockOnDiskSizeWithHeader() {
607     return nextBlockOnDiskSizeWithHeader;
608   }
609 
610 
611   /**
612    * Unified version 2 {@link HFile} block writer. The intended usage pattern
613    * is as follows:
614    * <ol>
615    * <li>Construct an {@link HFileBlock.Writer}, providing a compression algorithm.
616    * <li>Call {@link Writer#startWriting} and get a data stream to write to.
617    * <li>Write your data into the stream.
618    * <li>Call {@link Writer#writeHeaderAndData(FSDataOutputStream)} as many times as you need to.
619    * store the serialized block into an external stream.
620    * <li>Repeat to write more blocks.
621    * </ol>
622    * <p>
623    */
624   public static class Writer {
625 
626     private enum State {
627       INIT,
628       WRITING,
629       BLOCK_READY
630     };
631 
632     /** Writer state. Used to ensure the correct usage protocol. */
633     private State state = State.INIT;
634 
635     /** Data block encoder used for data blocks */
636     private final HFileDataBlockEncoder dataBlockEncoder;
637 
638     private HFileBlockEncodingContext dataBlockEncodingCtx;
639 
640     /** block encoding context for non-data blocks */
641     private HFileBlockDefaultEncodingContext defaultBlockEncodingCtx;
642 
643     /**
644      * The stream we use to accumulate data in uncompressed format for each
645      * block. We reset this stream at the end of each block and reuse it. The
646      * header is written as the first {@link HConstants#HFILEBLOCK_HEADER_SIZE} bytes into this
647      * stream.
648      */
649     private ByteArrayOutputStream baosInMemory;
650 
651     /**
652      * Current block type. Set in {@link #startWriting(BlockType)}. Could be
653      * changed in {@link #encodeDataBlockForDisk()} from {@link BlockType#DATA}
654      * to {@link BlockType#ENCODED_DATA}.
655      */
656     private BlockType blockType;
657 
658     /**
659      * A stream that we write uncompressed bytes to, which compresses them and
660      * writes them to {@link #baosInMemory}.
661      */
662     private DataOutputStream userDataStream;
663 
664     /**
665      * Bytes to be written to the file system, including the header. Compressed
666      * if compression is turned on. It also includes the checksum data that 
667      * immediately follows the block data. (header + data + checksums)
668      */
669     private byte[] onDiskBytesWithHeader;
670 
671     /**
672      * The size of the checksum data on disk. It is used only if data is
673      * not compressed. If data is compressed, then the checksums are already
674      * part of onDiskBytesWithHeader. If data is uncompressed, then this
675      * variable stores the checksum data for this block.
676      */
677     private byte[] onDiskChecksum;
678 
679     /**
680      * Valid in the READY state. Contains the header and the uncompressed (but
681      * potentially encoded, if this is a data block) bytes, so the length is
682      * {@link #uncompressedSizeWithoutHeader} + {@link org.apache.hadoop.hbase.HConstants#HFILEBLOCK_HEADER_SIZE}.
683      * Does not store checksums.
684      */
685     private byte[] uncompressedBytesWithHeader;
686 
687     /**
688      * Current block's start offset in the {@link HFile}. Set in
689      * {@link #writeHeaderAndData(FSDataOutputStream)}.
690      */
691     private long startOffset;
692 
693     /**
694      * Offset of previous block by block type. Updated when the next block is
695      * started.
696      */
697     private long[] prevOffsetByType;
698 
699     /** The offset of the previous block of the same type */
700     private long prevOffset;
701 
702     /** Whether we are including memstore timestamp after every key/value */
703     private boolean includesMemstoreTS;
704 
705     /** Checksum settings */
706     private ChecksumType checksumType;
707     private int bytesPerChecksum;
708 
709     /**
710      * @param compressionAlgorithm compression algorithm to use
711      * @param dataBlockEncoder data block encoding algorithm to use
712      * @param checksumType type of checksum
713      * @param bytesPerChecksum bytes per checksum
714      */
715     public Writer(Compression.Algorithm compressionAlgorithm,
716           HFileDataBlockEncoder dataBlockEncoder, boolean includesMemstoreTS,
717           ChecksumType checksumType, int bytesPerChecksum) {
718       this.dataBlockEncoder = dataBlockEncoder != null
719           ? dataBlockEncoder : NoOpDataBlockEncoder.INSTANCE;
720       defaultBlockEncodingCtx =
721         new HFileBlockDefaultEncodingContext(compressionAlgorithm, null, HConstants.HFILEBLOCK_DUMMY_HEADER);
722       dataBlockEncodingCtx =
723         this.dataBlockEncoder.newOnDiskDataBlockEncodingContext(
724             compressionAlgorithm, HConstants.HFILEBLOCK_DUMMY_HEADER);
725 
726       if (bytesPerChecksum < HConstants.HFILEBLOCK_HEADER_SIZE) {
727         throw new RuntimeException("Unsupported value of bytesPerChecksum. " +
728             " Minimum is " + HConstants.HFILEBLOCK_HEADER_SIZE + " but the configured value is " +
729             bytesPerChecksum);
730       }
731 
732       baosInMemory = new ByteArrayOutputStream();
733       
734       prevOffsetByType = new long[BlockType.values().length];
735       for (int i = 0; i < prevOffsetByType.length; ++i)
736         prevOffsetByType[i] = -1;
737 
738       this.includesMemstoreTS = includesMemstoreTS;
739       this.checksumType = checksumType;
740       this.bytesPerChecksum = bytesPerChecksum;
741     }
742 
743     /**
744      * Starts writing into the block. The previous block's data is discarded.
745      *
746      * @return the stream the user can write their data into
747      * @throws IOException
748      */
749     public DataOutputStream startWriting(BlockType newBlockType)
750         throws IOException {
751       if (state == State.BLOCK_READY && startOffset != -1) {
752         // We had a previous block that was written to a stream at a specific
753         // offset. Save that offset as the last offset of a block of that type.
754         prevOffsetByType[blockType.getId()] = startOffset;
755       }
756 
757       startOffset = -1;
758       blockType = newBlockType;
759 
760       baosInMemory.reset();
761       baosInMemory.write(HConstants.HFILEBLOCK_DUMMY_HEADER);
762 
763       state = State.WRITING;
764 
765       // We will compress it later in finishBlock()
766       userDataStream = new DataOutputStream(baosInMemory);
767       return userDataStream;
768     }
769 
770     /**
771      * Returns the stream for the user to write to. The block writer takes care
772      * of handling compression and buffering for caching on write. Can only be
773      * called in the "writing" state.
774      *
775      * @return the data output stream for the user to write to
776      */
777     DataOutputStream getUserDataStream() {
778       expectState(State.WRITING);
779       return userDataStream;
780     }
781 
782     /**
783      * Transitions the block writer from the "writing" state to the "block
784      * ready" state.  Does nothing if a block is already finished.
785      */
786     private void ensureBlockReady() throws IOException {
787       Preconditions.checkState(state != State.INIT,
788           "Unexpected state: " + state);
789 
790       if (state == State.BLOCK_READY)
791         return;
792 
793       // This will set state to BLOCK_READY.
794       finishBlock();
795     }
796 
797     /**
798      * An internal method that flushes the compressing stream (if using
799      * compression), serializes the header, and takes care of the separate
800      * uncompressed stream for caching on write, if applicable. Sets block
801      * write state to "block ready".
802      */
803     private void finishBlock() throws IOException {
804       userDataStream.flush();
805       // This does an array copy, so it is safe to cache this byte array.
806       uncompressedBytesWithHeader = baosInMemory.toByteArray();
807       prevOffset = prevOffsetByType[blockType.getId()];
808 
809       // We need to set state before we can package the block up for
810       // cache-on-write. In a way, the block is ready, but not yet encoded or
811       // compressed.
812       state = State.BLOCK_READY;
813       if (blockType == BlockType.DATA) {
814         encodeDataBlockForDisk();
815       } else {
816         defaultBlockEncodingCtx.compressAfterEncodingWithBlockType(
817             uncompressedBytesWithHeader, blockType);
818         onDiskBytesWithHeader =
819           defaultBlockEncodingCtx.getOnDiskBytesWithHeader();
820       }
821 
822       int numBytes = (int) ChecksumUtil.numBytes(
823           onDiskBytesWithHeader.length,
824           bytesPerChecksum);
825 
826       // put the header for on disk bytes
827       putHeader(onDiskBytesWithHeader, 0,
828           onDiskBytesWithHeader.length + numBytes,
829           uncompressedBytesWithHeader.length, onDiskBytesWithHeader.length);
830       // set the header for the uncompressed bytes (for cache-on-write)
831       putHeader(uncompressedBytesWithHeader, 0,
832           onDiskBytesWithHeader.length + numBytes,
833           uncompressedBytesWithHeader.length, onDiskBytesWithHeader.length);
834 
835       onDiskChecksum = new byte[numBytes];
836       ChecksumUtil.generateChecksums(
837           onDiskBytesWithHeader, 0, onDiskBytesWithHeader.length,
838           onDiskChecksum, 0, checksumType, bytesPerChecksum);
839     }
840 
841     /**
842      * Encodes this block if it is a data block and encoding is turned on in
843      * {@link #dataBlockEncoder}.
844      */
845     private void encodeDataBlockForDisk() throws IOException {
846       // do data block encoding, if data block encoder is set
847       ByteBuffer rawKeyValues =
848           ByteBuffer.wrap(uncompressedBytesWithHeader, HConstants.HFILEBLOCK_HEADER_SIZE,
849               uncompressedBytesWithHeader.length - HConstants.HFILEBLOCK_HEADER_SIZE).slice();
850 
851       //do the encoding
852       dataBlockEncoder.beforeWriteToDisk(rawKeyValues,
853               includesMemstoreTS, dataBlockEncodingCtx, blockType);
854 
855       uncompressedBytesWithHeader =
856           dataBlockEncodingCtx.getUncompressedBytesWithHeader();
857       onDiskBytesWithHeader =
858           dataBlockEncodingCtx.getOnDiskBytesWithHeader();
859       blockType = dataBlockEncodingCtx.getBlockType();
860     }
861 
862     /**
863      * Put the header into the given byte array at the given offset.
864      * @param onDiskSize size of the block on disk header + data + checksum
865      * @param uncompressedSize size of the block after decompression (but
866      *          before optional data block decoding) including header
867      * @param onDiskDataSize size of the block on disk with header
868      *        and data but not including the checksums
869      */
870     private void putHeader(byte[] dest, int offset, int onDiskSize,
871         int uncompressedSize, int onDiskDataSize) {
872       offset = blockType.put(dest, offset);
873       offset = Bytes.putInt(dest, offset, onDiskSize - HConstants.HFILEBLOCK_HEADER_SIZE);
874       offset = Bytes.putInt(dest, offset, uncompressedSize - HConstants.HFILEBLOCK_HEADER_SIZE);
875       offset = Bytes.putLong(dest, offset, prevOffset);
876       offset = Bytes.putByte(dest, offset, checksumType.getCode());
877       offset = Bytes.putInt(dest, offset, bytesPerChecksum);
878       Bytes.putInt(dest, offset, onDiskDataSize);
879     }
880 
881     /**
882      * Similar to {@link #writeHeaderAndData(FSDataOutputStream)}, but records
883      * the offset of this block so that it can be referenced in the next block
884      * of the same type.
885      *
886      * @param out
887      * @throws IOException
888      */
889     public void writeHeaderAndData(FSDataOutputStream out) throws IOException {
890       long offset = out.getPos();
891       if (startOffset != -1 && offset != startOffset) {
892         throw new IOException("A " + blockType + " block written to a "
893             + "stream twice, first at offset " + startOffset + ", then at "
894             + offset);
895       }
896       startOffset = offset;
897 
898       finishBlockAndWriteHeaderAndData((DataOutputStream) out);
899     }
900 
901     /**
902      * Writes the header and the compressed data of this block (or uncompressed
903      * data when not using compression) into the given stream. Can be called in
904      * the "writing" state or in the "block ready" state. If called in the
905      * "writing" state, transitions the writer to the "block ready" state.
906      *
907      * @param out the output stream to write the
908      * @throws IOException
909      */
910     private void finishBlockAndWriteHeaderAndData(DataOutputStream out)
911       throws IOException {
912       ensureBlockReady();
913       out.write(onDiskBytesWithHeader);
914       out.write(onDiskChecksum);
915     }
916 
917     /**
918      * Returns the header or the compressed data (or uncompressed data when not
919      * using compression) as a byte array. Can be called in the "writing" state
920      * or in the "block ready" state. If called in the "writing" state,
921      * transitions the writer to the "block ready" state. This returns
922      * the header + data + checksums stored on disk.
923      *
924      * @return header and data as they would be stored on disk in a byte array
925      * @throws IOException
926      */
927     byte[] getHeaderAndDataForTest() throws IOException {
928       ensureBlockReady();
929       // This is not very optimal, because we are doing an extra copy.
930       // But this method is used only by unit tests.
931       byte[] output =
932           new byte[onDiskBytesWithHeader.length
933               + onDiskChecksum.length];
934       System.arraycopy(onDiskBytesWithHeader, 0, output, 0,
935           onDiskBytesWithHeader.length);
936       System.arraycopy(onDiskChecksum, 0, output,
937           onDiskBytesWithHeader.length, onDiskChecksum.length);
938       return output;
939     }
940 
941     /**
942      * Releases resources used by this writer.
943      */
944     public void release() {
945       if (dataBlockEncodingCtx != null) {
946         dataBlockEncodingCtx.close();
947         dataBlockEncodingCtx = null;
948       }
949       if (defaultBlockEncodingCtx != null) {
950         defaultBlockEncodingCtx.close();
951         defaultBlockEncodingCtx = null;
952       }
953     }
954 
955     /**
956      * Returns the on-disk size of the data portion of the block. This is the
957      * compressed size if compression is enabled. Can only be called in the
958      * "block ready" state. Header is not compressed, and its size is not
959      * included in the return value.
960      *
961      * @return the on-disk size of the block, not including the header.
962      */
963     int getOnDiskSizeWithoutHeader() {
964       expectState(State.BLOCK_READY);
965       return onDiskBytesWithHeader.length + onDiskChecksum.length - HConstants.HFILEBLOCK_HEADER_SIZE;
966     }
967 
968     /**
969      * Returns the on-disk size of the block. Can only be called in the
970      * "block ready" state.
971      *
972      * @return the on-disk size of the block ready to be written, including the
973      *         header size, the data and the checksum data.
974      */
975     int getOnDiskSizeWithHeader() {
976       expectState(State.BLOCK_READY);
977       return onDiskBytesWithHeader.length + onDiskChecksum.length;
978     }
979 
980     /**
981      * The uncompressed size of the block data. Does not include header size.
982      */
983     int getUncompressedSizeWithoutHeader() {
984       expectState(State.BLOCK_READY);
985       return uncompressedBytesWithHeader.length - HConstants.HFILEBLOCK_HEADER_SIZE;
986     }
987 
988     /**
989      * The uncompressed size of the block data, including header size.
990      */
991     int getUncompressedSizeWithHeader() {
992       expectState(State.BLOCK_READY);
993       return uncompressedBytesWithHeader.length;
994     }
995 
996     /** @return true if a block is being written  */
997     public boolean isWriting() {
998       return state == State.WRITING;
999     }
1000 
1001     /**
1002      * Returns the number of bytes written into the current block so far, or
1003      * zero if not writing the block at the moment. Note that this will return
1004      * zero in the "block ready" state as well.
1005      *
1006      * @return the number of bytes written
1007      */
1008     public int blockSizeWritten() {
1009       if (state != State.WRITING)
1010         return 0;
1011       return userDataStream.size();
1012     }
1013 
1014     /**
1015      * Returns the header followed by the uncompressed data, even if using
1016      * compression. This is needed for storing uncompressed blocks in the block
1017      * cache. Can be called in the "writing" state or the "block ready" state.
1018      * Returns only the header and data, does not include checksum data.
1019      *
1020      * @return uncompressed block bytes for caching on write
1021      */
1022     ByteBuffer getUncompressedBufferWithHeader() {
1023       expectState(State.BLOCK_READY);
1024       return ByteBuffer.wrap(uncompressedBytesWithHeader);
1025     }
1026 
1027     private void expectState(State expectedState) {
1028       if (state != expectedState) {
1029         throw new IllegalStateException("Expected state: " + expectedState +
1030             ", actual state: " + state);
1031       }
1032     }
1033 
1034     /**
1035      * Takes the given {@link BlockWritable} instance, creates a new block of
1036      * its appropriate type, writes the writable into this block, and flushes
1037      * the block into the output stream. The writer is instructed not to buffer
1038      * uncompressed bytes for cache-on-write.
1039      *
1040      * @param bw the block-writable object to write as a block
1041      * @param out the file system output stream
1042      * @throws IOException
1043      */
1044     public void writeBlock(BlockWritable bw, FSDataOutputStream out)
1045         throws IOException {
1046       bw.writeToBlock(startWriting(bw.getBlockType()));
1047       writeHeaderAndData(out);
1048     }
1049 
1050     /**
1051      * Creates a new HFileBlock. Checksums have already been validated, so
1052      * the byte buffer passed into the constructor of this newly created
1053      * block does not have checksum data even though the header minor 
1054      * version is MINOR_VERSION_WITH_CHECKSUM. This is indicated by setting a
1055      * 0 value in bytesPerChecksum.
1056      */
1057     public HFileBlock getBlockForCaching() {
1058       return new HFileBlock(blockType, getOnDiskSizeWithoutHeader(),
1059           getUncompressedSizeWithoutHeader(), prevOffset,
1060           getUncompressedBufferWithHeader(), DONT_FILL_HEADER, startOffset,
1061           includesMemstoreTS, MINOR_VERSION_WITH_CHECKSUM,
1062           0, ChecksumType.NULL.getCode(),  // no checksums in cached data
1063           onDiskBytesWithHeader.length + onDiskChecksum.length);
1064     }
1065   }
1066 
1067   /** Something that can be written into a block. */
1068   public interface BlockWritable {
1069 
1070     /** The type of block this data should use. */
1071     BlockType getBlockType();
1072 
1073     /**
1074      * Writes the block to the provided stream. Must not write any magic
1075      * records.
1076      *
1077      * @param out a stream to write uncompressed data into
1078      */
1079     void writeToBlock(DataOutput out) throws IOException;
1080   }
1081 
1082   // Block readers and writers
1083 
1084   /** An interface allowing to iterate {@link HFileBlock}s. */
1085   public interface BlockIterator {
1086 
1087     /**
1088      * Get the next block, or null if there are no more blocks to iterate.
1089      */
1090     HFileBlock nextBlock() throws IOException;
1091 
1092     /**
1093      * Similar to {@link #nextBlock()} but checks block type, throws an
1094      * exception if incorrect, and returns the HFile block
1095      */
1096     HFileBlock nextBlockWithBlockType(BlockType blockType) throws IOException;
1097   }
1098 
1099   /** A full-fledged reader with iteration ability. */
1100   public interface FSReader {
1101 
1102     /**
1103      * Reads the block at the given offset in the file with the given on-disk
1104      * size and uncompressed size.
1105      *
1106      * @param offset
1107      * @param onDiskSize the on-disk size of the entire block, including all
1108      *          applicable headers, or -1 if unknown
1109      * @param uncompressedSize the uncompressed size of the compressed part of
1110      *          the block, or -1 if unknown
1111      * @return the newly read block
1112      */
1113     HFileBlock readBlockData(long offset, long onDiskSize,
1114         int uncompressedSize, boolean pread) throws IOException;
1115 
1116     /**
1117      * Creates a block iterator over the given portion of the {@link HFile}.
1118      * The iterator returns blocks starting with offset such that offset <=
1119      * startOffset < endOffset.
1120      *
1121      * @param startOffset the offset of the block to start iteration with
1122      * @param endOffset the offset to end iteration at (exclusive)
1123      * @return an iterator of blocks between the two given offsets
1124      */
1125     BlockIterator blockRange(long startOffset, long endOffset);
1126 
1127     /** Closes the backing streams */
1128     void closeStreams() throws IOException;
1129   }
1130 
1131   /**
1132    * A common implementation of some methods of {@link FSReader} and some
1133    * tools for implementing HFile format version-specific block readers.
1134    */
1135   private abstract static class AbstractFSReader implements FSReader {
1136     /** Compression algorithm used by the {@link HFile} */
1137     protected Compression.Algorithm compressAlgo;
1138 
1139     /** The size of the file we are reading from, or -1 if unknown. */
1140     protected long fileSize;
1141 
1142     /** The minor version of this reader */
1143     private int minorVersion;
1144 
1145     /** The size of the header */
1146     protected final int hdrSize;
1147 
1148     /** The filesystem used to access data */
1149     protected HFileSystem hfs;
1150 
1151     /** The path (if any) where this data is coming from */
1152     protected Path path;
1153 
1154     private final Lock streamLock = new ReentrantLock();
1155 
1156     /** The default buffer size for our buffered streams */
1157     public static final int DEFAULT_BUFFER_SIZE = 1 << 20;
1158 
1159     public AbstractFSReader(Algorithm compressAlgo, long fileSize, int minorVersion,
1160         HFileSystem hfs, Path path) throws IOException {
1161       this.compressAlgo = compressAlgo;
1162       this.fileSize = fileSize;
1163       this.minorVersion = minorVersion;
1164       this.hfs = hfs;
1165       this.path = path;
1166       this.hdrSize = headerSize(minorVersion);
1167     }
1168 
1169     @Override
1170     public BlockIterator blockRange(final long startOffset,
1171         final long endOffset) {
1172       return new BlockIterator() {
1173         private long offset = startOffset;
1174 
1175         @Override
1176         public HFileBlock nextBlock() throws IOException {
1177           if (offset >= endOffset)
1178             return null;
1179           HFileBlock b = readBlockData(offset, -1, -1, false);
1180           offset += b.getOnDiskSizeWithHeader();
1181           return b;
1182         }
1183 
1184         @Override
1185         public HFileBlock nextBlockWithBlockType(BlockType blockType)
1186             throws IOException {
1187           HFileBlock blk = nextBlock();
1188           if (blk.getBlockType() != blockType) {
1189             throw new IOException("Expected block of type " + blockType
1190                 + " but found " + blk.getBlockType());
1191           }
1192           return blk;
1193         }
1194       };
1195     }
1196 
1197     /**
1198      * Does a positional read or a seek and read into the given buffer. Returns
1199      * the on-disk size of the next block, or -1 if it could not be determined.
1200      *
1201      * @param dest destination buffer
1202      * @param destOffset offset in the destination buffer
1203      * @param size size of the block to be read
1204      * @param peekIntoNextBlock whether to read the next block's on-disk size
1205      * @param fileOffset position in the stream to read at
1206      * @param pread whether we should do a positional read
1207      * @param istream The input source of data
1208      * @return the on-disk size of the next block with header size included, or
1209      *         -1 if it could not be determined
1210      * @throws IOException
1211      */
1212     protected int readAtOffset(FSDataInputStream istream,
1213         byte[] dest, int destOffset, int size,
1214         boolean peekIntoNextBlock, long fileOffset, boolean pread)
1215         throws IOException {
1216       if (peekIntoNextBlock &&
1217           destOffset + size + hdrSize > dest.length) {
1218         // We are asked to read the next block's header as well, but there is
1219         // not enough room in the array.
1220         throw new IOException("Attempted to read " + size + " bytes and " +
1221             hdrSize + " bytes of next header into a " + dest.length +
1222             "-byte array at offset " + destOffset);
1223       }
1224 
1225       if (!pread && streamLock.tryLock()) {
1226         // Seek + read. Better for scanning.
1227         try {
1228           istream.seek(fileOffset);
1229 
1230           long realOffset = istream.getPos();
1231           if (realOffset != fileOffset) {
1232             throw new IOException("Tried to seek to " + fileOffset + " to "
1233                 + "read " + size + " bytes, but pos=" + realOffset
1234                 + " after seek");
1235           }
1236 
1237           if (!peekIntoNextBlock) {
1238             IOUtils.readFully(istream, dest, destOffset, size);
1239             return -1;
1240           }
1241 
1242           // Try to read the next block header.
1243           if (!readWithExtra(istream, dest, destOffset, size, hdrSize))
1244             return -1;
1245         } finally {
1246           streamLock.unlock();
1247         }
1248       } else {
1249         // Positional read. Better for random reads; or when the streamLock is already locked.
1250         int extraSize = peekIntoNextBlock ? hdrSize : 0;
1251 
1252         int ret = istream.read(fileOffset, dest, destOffset, size + extraSize);
1253         if (ret < size) {
1254           throw new IOException("Positional read of " + size + " bytes " +
1255               "failed at offset " + fileOffset + " (returned " + ret + ")");
1256         }
1257 
1258         if (ret == size || ret < size + extraSize) {
1259           // Could not read the next block's header, or did not try.
1260           return -1;
1261         }
1262       }
1263 
1264       assert peekIntoNextBlock;
1265       return Bytes.toInt(dest, destOffset + size + BlockType.MAGIC_LENGTH) +
1266           hdrSize;
1267     }
1268 
1269     /**
1270      * @return The minorVersion of this HFile
1271      */
1272     protected int getMinorVersion() {
1273       return minorVersion;
1274     }
1275   }
1276 
1277   /**
1278    * We always prefetch the header of the next block, so that we know its
1279    * on-disk size in advance and can read it in one operation.
1280    */
1281   private static class PrefetchedHeader {
1282     long offset = -1;
1283     byte[] header = new byte[HConstants.HFILEBLOCK_HEADER_SIZE];
1284     ByteBuffer buf = ByteBuffer.wrap(header, 0, HConstants.HFILEBLOCK_HEADER_SIZE);
1285   }
1286 
1287   /** Reads version 2 blocks from the filesystem. */
1288   static class FSReaderV2 extends AbstractFSReader {
1289     /** The file system stream of the underlying {@link HFile} that 
1290      * does or doesn't do checksum validations in the filesystem */
1291     protected FSDataInputStreamWrapper streamWrapper;
1292 
1293     /** Whether we include memstore timestamp in data blocks */
1294     protected boolean includesMemstoreTS;
1295 
1296     /** Data block encoding used to read from file */
1297     protected HFileDataBlockEncoder dataBlockEncoder =
1298         NoOpDataBlockEncoder.INSTANCE;
1299 
1300     private HFileBlockDecodingContext encodedBlockDecodingCtx;
1301 
1302     private HFileBlockDefaultDecodingContext defaultDecodingCtx;
1303 
1304     private ThreadLocal<PrefetchedHeader> prefetchedHeaderForThread =
1305         new ThreadLocal<PrefetchedHeader>() {
1306           @Override
1307           public PrefetchedHeader initialValue() {
1308             return new PrefetchedHeader();
1309           }
1310         };
1311 
1312     public FSReaderV2(FSDataInputStreamWrapper stream, Algorithm compressAlgo, long fileSize,
1313         int minorVersion, HFileSystem hfs, Path path) throws IOException {
1314       super(compressAlgo, fileSize, minorVersion, hfs, path);
1315       this.streamWrapper = stream;
1316       // Older versions of HBase didn't support checksum.
1317       boolean forceNoHBaseChecksum = (this.getMinorVersion() < MINOR_VERSION_WITH_CHECKSUM);
1318       this.streamWrapper.prepareForBlockReader(forceNoHBaseChecksum);
1319 
1320       defaultDecodingCtx =
1321         new HFileBlockDefaultDecodingContext(compressAlgo);
1322       encodedBlockDecodingCtx =
1323           new HFileBlockDefaultDecodingContext(compressAlgo);
1324     }
1325 
1326     /**
1327      * A constructor that reads files with the latest minor version.
1328      * This is used by unit tests only.
1329      */
1330     FSReaderV2(FSDataInputStream istream, Algorithm compressAlgo,
1331         long fileSize) throws IOException {
1332       this(new FSDataInputStreamWrapper(istream), compressAlgo, fileSize,
1333            HFileReaderV2.MAX_MINOR_VERSION, null, null);
1334     }
1335 
1336     /**
1337      * Reads a version 2 block. Tries to do as little memory allocation as
1338      * possible, using the provided on-disk size.
1339      *
1340      * @param offset the offset in the stream to read at
1341      * @param onDiskSizeWithHeaderL the on-disk size of the block, including
1342      *          the header, or -1 if unknown
1343      * @param uncompressedSize the uncompressed size of the the block. Always
1344      *          expected to be -1. This parameter is only used in version 1.
1345      * @param pread whether to use a positional read
1346      */
1347     @Override
1348     public HFileBlock readBlockData(long offset, long onDiskSizeWithHeaderL,
1349         int uncompressedSize, boolean pread) throws IOException {
1350 
1351       // get a copy of the current state of whether to validate
1352       // hbase checksums or not for this read call. This is not 
1353       // thread-safe but the one constaint is that if we decide 
1354       // to skip hbase checksum verification then we are 
1355       // guaranteed to use hdfs checksum verification.
1356       boolean doVerificationThruHBaseChecksum = streamWrapper.shouldUseHBaseChecksum();
1357       FSDataInputStream is = streamWrapper.getStream(doVerificationThruHBaseChecksum);
1358 
1359       HFileBlock blk = readBlockDataInternal(is, offset, 
1360                          onDiskSizeWithHeaderL, 
1361                          uncompressedSize, pread,
1362                          doVerificationThruHBaseChecksum);
1363       if (blk == null) {
1364         HFile.LOG.warn("HBase checksum verification failed for file " +
1365                        path + " at offset " +
1366                        offset + " filesize " + fileSize +
1367                        ". Retrying read with HDFS checksums turned on...");
1368 
1369         if (!doVerificationThruHBaseChecksum) {
1370           String msg = "HBase checksum verification failed for file " +
1371                        path + " at offset " +
1372                        offset + " filesize " + fileSize + 
1373                        " but this cannot happen because doVerify is " +
1374                        doVerificationThruHBaseChecksum;
1375           HFile.LOG.warn(msg);
1376           throw new IOException(msg); // cannot happen case here
1377         }
1378         HFile.checksumFailures.incrementAndGet(); // update metrics
1379 
1380         // If we have a checksum failure, we fall back into a mode where
1381         // the next few reads use HDFS level checksums. We aim to make the
1382         // next CHECKSUM_VERIFICATION_NUM_IO_THRESHOLD reads avoid
1383         // hbase checksum verification, but since this value is set without
1384         // holding any locks, it can so happen that we might actually do
1385         // a few more than precisely this number.
1386         is = this.streamWrapper.fallbackToFsChecksum(CHECKSUM_VERIFICATION_NUM_IO_THRESHOLD);
1387         doVerificationThruHBaseChecksum = false;
1388         blk = readBlockDataInternal(is, offset, onDiskSizeWithHeaderL,
1389                                     uncompressedSize, pread,
1390                                     doVerificationThruHBaseChecksum);
1391         if (blk != null) {
1392           HFile.LOG.warn("HDFS checksum verification suceeded for file " +
1393                          path + " at offset " +
1394                          offset + " filesize " + fileSize);
1395         }
1396       } 
1397       if (blk == null && !doVerificationThruHBaseChecksum) {
1398         String msg = "readBlockData failed, possibly due to " +
1399                      "checksum verification failed for file " + path +
1400                      " at offset " + offset + " filesize " + fileSize;
1401         HFile.LOG.warn(msg);
1402         throw new IOException(msg);
1403       }
1404 
1405       // If there is a checksum mismatch earlier, then retry with 
1406       // HBase checksums switched off and use HDFS checksum verification.
1407       // This triggers HDFS to detect and fix corrupt replicas. The
1408       // next checksumOffCount read requests will use HDFS checksums.
1409       // The decrementing of this.checksumOffCount is not thread-safe,
1410       // but it is harmless because eventually checksumOffCount will be
1411       // a negative number.
1412       streamWrapper.checksumOk();
1413       return blk;
1414     }
1415 
1416     /**
1417      * Reads a version 2 block. 
1418      *
1419      * @param offset the offset in the stream to read at
1420      * @param onDiskSizeWithHeaderL the on-disk size of the block, including
1421      *          the header, or -1 if unknown
1422      * @param uncompressedSize the uncompressed size of the the block. Always
1423      *          expected to be -1. This parameter is only used in version 1.
1424      * @param pread whether to use a positional read
1425      * @param verifyChecksum Whether to use HBase checksums. 
1426      *        If HBase checksum is switched off, then use HDFS checksum.
1427      * @return the HFileBlock or null if there is a HBase checksum mismatch
1428      */
1429     private HFileBlock readBlockDataInternal(FSDataInputStream is, long offset, 
1430         long onDiskSizeWithHeaderL, int uncompressedSize, boolean pread,
1431         boolean verifyChecksum) throws IOException {
1432       if (offset < 0) {
1433         throw new IOException("Invalid offset=" + offset + " trying to read "
1434             + "block (onDiskSize=" + onDiskSizeWithHeaderL
1435             + ", uncompressedSize=" + uncompressedSize + ")");
1436       }
1437       if (uncompressedSize != -1) {
1438         throw new IOException("Version 2 block reader API does not need " +
1439             "the uncompressed size parameter");
1440       }
1441 
1442       if ((onDiskSizeWithHeaderL < hdrSize && onDiskSizeWithHeaderL != -1)
1443           || onDiskSizeWithHeaderL >= Integer.MAX_VALUE) {
1444         throw new IOException("Invalid onDisksize=" + onDiskSizeWithHeaderL
1445             + ": expected to be at least " + hdrSize
1446             + " and at most " + Integer.MAX_VALUE + ", or -1 (offset="
1447             + offset + ", uncompressedSize=" + uncompressedSize + ")");
1448       }
1449 
1450       int onDiskSizeWithHeader = (int) onDiskSizeWithHeaderL;
1451       // See if we can avoid reading the header. This is desirable, because
1452       // we will not incur a backward seek operation if we have already
1453       // read this block's header as part of the previous read's look-ahead.
1454       // And we also want to skip reading the header again if it has already
1455       // been read.
1456       PrefetchedHeader prefetchedHeader = prefetchedHeaderForThread.get();
1457       ByteBuffer headerBuf = prefetchedHeader.offset == offset ?
1458           prefetchedHeader.buf : null;
1459 
1460       int nextBlockOnDiskSize = 0;
1461       // Allocate enough space to fit the next block's header too.
1462       byte[] onDiskBlock = null;
1463 
1464       HFileBlock b = null;
1465       if (onDiskSizeWithHeader > 0) {
1466         // We know the total on-disk size but not the uncompressed size. Read
1467         // the entire block into memory, then parse the header and decompress
1468         // from memory if using compression. This code path is used when
1469         // doing a random read operation relying on the block index, as well as
1470         // when the client knows the on-disk size from peeking into the next
1471         // block's header (e.g. this block's header) when reading the previous
1472         // block. This is the faster and more preferable case.
1473 
1474         // Size that we have to skip in case we have already read the header.
1475         int preReadHeaderSize = headerBuf == null ? 0 : hdrSize;
1476         onDiskBlock = new byte[onDiskSizeWithHeader + hdrSize];
1477         nextBlockOnDiskSize = readAtOffset(is, onDiskBlock,
1478             preReadHeaderSize, onDiskSizeWithHeader - preReadHeaderSize,
1479             true, offset + preReadHeaderSize, pread);
1480         if (headerBuf != null) {
1481           // the header has been read when reading the previous block, copy
1482           // to this block's header
1483           System.arraycopy(headerBuf.array(),
1484               headerBuf.arrayOffset(), onDiskBlock, 0, hdrSize);
1485         } else {
1486           headerBuf = ByteBuffer.wrap(onDiskBlock, 0, hdrSize);
1487         }
1488         // We know the total on-disk size but not the uncompressed size. Read
1489         // the entire block into memory, then parse the header and decompress
1490         // from memory if using compression. Here we have already read the
1491         // block's header
1492         try {
1493           b = new HFileBlock(headerBuf, getMinorVersion());
1494         } catch (IOException ex) {
1495           // Seen in load testing. Provide comprehensive debug info.
1496           throw new IOException("Failed to read compressed block at "
1497               + offset
1498               + ", onDiskSizeWithoutHeader="
1499               + onDiskSizeWithHeader
1500               + ", preReadHeaderSize="
1501               + hdrSize
1502               + ", header.length="
1503               + prefetchedHeader.header.length
1504               + ", header bytes: "
1505               + Bytes.toStringBinary(prefetchedHeader.header, 0,
1506                   hdrSize), ex);
1507         }
1508         // if the caller specifies a onDiskSizeWithHeader, validate it.
1509         int onDiskSizeWithoutHeader = onDiskSizeWithHeader - hdrSize;
1510         assert onDiskSizeWithoutHeader >= 0;
1511         b.validateOnDiskSizeWithoutHeader(onDiskSizeWithoutHeader);
1512       } else {
1513         // Check headerBuf to see if we have read this block's header as part of
1514         // reading the previous block. This is an optimization of peeking into
1515         // the next block's header (e.g.this block's header) when reading the
1516         // previous block. This is the faster and more preferable case. If the
1517         // header is already there, don't read the header again.
1518 
1519         // Unfortunately, we still have to do a separate read operation to
1520         // read the header.
1521         if (headerBuf == null) {
1522           // From the header, determine the on-disk size of the given hfile
1523           // block, and read the remaining data, thereby incurring two read
1524           // operations. This might happen when we are doing the first read
1525           // in a series of reads or a random read, and we don't have access
1526           // to the block index. This is costly and should happen very rarely.
1527           headerBuf = ByteBuffer.allocate(hdrSize);
1528           readAtOffset(is, headerBuf.array(), headerBuf.arrayOffset(),
1529               hdrSize, false, offset, pread);
1530         }
1531 
1532         b = new HFileBlock(headerBuf, getMinorVersion());
1533         onDiskBlock = new byte[b.getOnDiskSizeWithHeader() + hdrSize];
1534         System.arraycopy(headerBuf.array(),
1535               headerBuf.arrayOffset(), onDiskBlock, 0, hdrSize);
1536         nextBlockOnDiskSize =
1537           readAtOffset(is, onDiskBlock, hdrSize, b.getOnDiskSizeWithHeader()
1538               - hdrSize, true, offset + hdrSize, pread);
1539         onDiskSizeWithHeader = b.onDiskSizeWithoutHeader + hdrSize;
1540       }
1541 
1542       boolean isCompressed =
1543         compressAlgo != null
1544             && compressAlgo != Compression.Algorithm.NONE;
1545       if (!isCompressed) {
1546         b.assumeUncompressed();
1547       }
1548 
1549       if (verifyChecksum &&
1550           !validateBlockChecksum(b, onDiskBlock, hdrSize)) {
1551         return null;             // checksum mismatch
1552       }
1553 
1554       if (isCompressed) {
1555         // This will allocate a new buffer but keep header bytes.
1556         b.allocateBuffer(nextBlockOnDiskSize > 0);
1557         if (b.blockType.equals(BlockType.ENCODED_DATA)) {
1558           encodedBlockDecodingCtx.prepareDecoding(b.getOnDiskSizeWithoutHeader(),
1559               b.getUncompressedSizeWithoutHeader(), b.getBufferWithoutHeader(), onDiskBlock,
1560               hdrSize);
1561         } else {
1562           defaultDecodingCtx.prepareDecoding(b.getOnDiskSizeWithoutHeader(),
1563               b.getUncompressedSizeWithoutHeader(), b.getBufferWithoutHeader(), onDiskBlock,
1564               hdrSize);
1565         }
1566         if (nextBlockOnDiskSize > 0) {
1567           // Copy next block's header bytes into the new block if we have them.
1568           System.arraycopy(onDiskBlock, onDiskSizeWithHeader, b.buf.array(),
1569               b.buf.arrayOffset() + hdrSize
1570               + b.uncompressedSizeWithoutHeader + b.totalChecksumBytes(),
1571               hdrSize);
1572         }
1573       } else {
1574         // The onDiskBlock will become the headerAndDataBuffer for this block.
1575         // If nextBlockOnDiskSizeWithHeader is not zero, the onDiskBlock already
1576         // contains the header of next block, so no need to set next
1577         // block's header in it.
1578         b = new HFileBlock(ByteBuffer.wrap(onDiskBlock, 0,
1579                 onDiskSizeWithHeader), getMinorVersion());
1580       }
1581 
1582       b.nextBlockOnDiskSizeWithHeader = nextBlockOnDiskSize;
1583 
1584       // Set prefetched header
1585       if (b.nextBlockOnDiskSizeWithHeader > 0) {
1586         prefetchedHeader.offset = offset + b.getOnDiskSizeWithHeader();
1587         System.arraycopy(onDiskBlock, onDiskSizeWithHeader,
1588             prefetchedHeader.header, 0, hdrSize);
1589       }
1590 
1591       b.includesMemstoreTS = includesMemstoreTS;
1592       b.offset = offset;
1593       return b;
1594     }
1595 
1596     void setIncludesMemstoreTS(boolean enabled) {
1597       includesMemstoreTS = enabled;
1598     }
1599 
1600     void setDataBlockEncoder(HFileDataBlockEncoder encoder) {
1601       this.dataBlockEncoder = encoder;
1602       encodedBlockDecodingCtx = encoder.newOnDiskDataBlockDecodingContext(
1603           this.compressAlgo);
1604     }
1605 
1606     /**
1607      * Generates the checksum for the header as well as the data and
1608      * then validates that it matches the value stored in the header.
1609      * If there is a checksum mismatch, then return false. Otherwise
1610      * return true.
1611      */
1612     protected boolean validateBlockChecksum(HFileBlock block, 
1613       byte[] data, int hdrSize) throws IOException {
1614       return ChecksumUtil.validateBlockChecksum(path, block,
1615                                                 data, hdrSize);
1616     }
1617 
1618     @Override
1619     public void closeStreams() throws IOException {
1620       streamWrapper.close();
1621     }
1622   }
1623 
1624   @Override
1625   public int getSerializedLength() {
1626     if (buf != null) {
1627       return this.buf.limit() + HFileBlock.EXTRA_SERIALIZATION_SPACE;
1628     }
1629     return 0;
1630   }
1631 
1632   @Override
1633   public void serialize(ByteBuffer destination) {
1634     ByteBuffer dupBuf = this.buf.duplicate();
1635     dupBuf.rewind();
1636     destination.put(dupBuf);
1637     destination.putInt(this.minorVersion);
1638     destination.putLong(this.offset);
1639     destination.putInt(this.nextBlockOnDiskSizeWithHeader);
1640     destination.rewind();
1641   }
1642 
1643   public void serializeExtraInfo(ByteBuffer destination) {
1644     destination.putInt(this.minorVersion);
1645     destination.putLong(this.offset);
1646     destination.putInt(this.nextBlockOnDiskSizeWithHeader);
1647     destination.rewind();
1648   }
1649 
1650   @Override
1651   public CacheableDeserializer<Cacheable> getDeserializer() {
1652     return HFileBlock.blockDeserializer;
1653   }
1654 
1655   @Override
1656   public boolean equals(Object comparison) {
1657     if (this == comparison) {
1658       return true;
1659     }
1660     if (comparison == null) {
1661       return false;
1662     }
1663     if (comparison.getClass() != this.getClass()) {
1664       return false;
1665     }
1666 
1667     HFileBlock castedComparison = (HFileBlock) comparison;
1668 
1669     if (castedComparison.blockType != this.blockType) {
1670       return false;
1671     }
1672     if (castedComparison.nextBlockOnDiskSizeWithHeader != this.nextBlockOnDiskSizeWithHeader) {
1673       return false;
1674     }
1675     if (castedComparison.offset != this.offset) {
1676       return false;
1677     }
1678     if (castedComparison.onDiskSizeWithoutHeader != this.onDiskSizeWithoutHeader) {
1679       return false;
1680     }
1681     if (castedComparison.prevBlockOffset != this.prevBlockOffset) {
1682       return false;
1683     }
1684     if (castedComparison.uncompressedSizeWithoutHeader != this.uncompressedSizeWithoutHeader) {
1685       return false;
1686     }
1687     if (this.buf.compareTo(castedComparison.buf) != 0) {
1688       return false;
1689     }
1690     if (this.buf.position() != castedComparison.buf.position()){
1691       return false;
1692     }
1693     if (this.buf.limit() != castedComparison.buf.limit()){
1694       return false;
1695     }
1696     return true;
1697   }
1698 
1699   public boolean doesIncludeMemstoreTS() {
1700     return includesMemstoreTS;
1701   }
1702 
1703   public DataBlockEncoding getDataBlockEncoding() {
1704     if (blockType == BlockType.ENCODED_DATA) {
1705       return DataBlockEncoding.getEncodingById(getDataBlockEncodingId());
1706     }
1707     return DataBlockEncoding.NONE;
1708   }
1709 
1710   byte getChecksumType() {
1711     return this.checksumType;
1712   }
1713 
1714   int getBytesPerChecksum() {
1715     return this.bytesPerChecksum;
1716   }
1717 
1718   int getOnDiskDataSizeWithHeader() {
1719     return this.onDiskDataSizeWithHeader;
1720   }
1721 
1722   int getMinorVersion() {
1723     return this.minorVersion;
1724   }
1725 
1726   /** 
1727    * Calcuate the number of bytes required to store all the checksums
1728    * for this block. Each checksum value is a 4 byte integer.
1729    */
1730   int totalChecksumBytes() {
1731     // If the hfile block has minorVersion 0, then there are no checksum
1732     // data to validate. Similarly, a zero value in this.bytesPerChecksum
1733     // indicates that cached blocks do not have checksum data because
1734     // checksums were already validated when the block was read from disk.
1735     if (minorVersion < MINOR_VERSION_WITH_CHECKSUM || this.bytesPerChecksum == 0) {
1736       return 0;
1737     }
1738     return (int)ChecksumUtil.numBytes(onDiskDataSizeWithHeader, bytesPerChecksum);
1739   }
1740 
1741   /**
1742    * Returns the size of this block header.
1743    */
1744   public int headerSize() {
1745     return headerSize(this.minorVersion);
1746   }
1747 
1748   /**
1749    * Maps a minor version to the size of the header.
1750    */
1751   public static int headerSize(int minorVersion) {
1752     if (minorVersion < MINOR_VERSION_WITH_CHECKSUM) {
1753       return HConstants.HFILEBLOCK_HEADER_SIZE_NO_CHECKSUM;
1754     }
1755     return HConstants.HFILEBLOCK_HEADER_SIZE;
1756   }
1757 
1758   /**
1759    * Return the appropriate DUMMY_HEADER for the minor version
1760    */
1761   public byte[] getDummyHeaderForVersion() {
1762     return getDummyHeaderForVersion(minorVersion);
1763   }
1764 
1765   /**
1766    * Return the appropriate DUMMY_HEADER for the minor version
1767    */
1768   static private byte[] getDummyHeaderForVersion(int minorVersion) {
1769     if (minorVersion < MINOR_VERSION_WITH_CHECKSUM) {
1770       return DUMMY_HEADER_NO_CHECKSUM;
1771     }
1772     return HConstants.HFILEBLOCK_DUMMY_HEADER;
1773   }
1774 
1775   /**
1776    * Convert the contents of the block header into a human readable string.
1777    * This is mostly helpful for debugging. This assumes that the block
1778    * has minor version > 0.
1779    */
1780   static String toStringHeader(ByteBuffer buf) throws IOException {
1781     int offset = buf.arrayOffset();
1782     byte[] b = buf.array();
1783     long magic = Bytes.toLong(b, offset); 
1784     BlockType bt = BlockType.read(buf);
1785     offset += Bytes.SIZEOF_LONG;
1786     int compressedBlockSizeNoHeader = Bytes.toInt(b, offset);
1787     offset += Bytes.SIZEOF_INT;
1788     int uncompressedBlockSizeNoHeader = Bytes.toInt(b, offset);
1789     offset += Bytes.SIZEOF_INT;
1790     long prevBlockOffset = Bytes.toLong(b, offset); 
1791     offset += Bytes.SIZEOF_LONG;
1792     byte cksumtype = b[offset];
1793     offset += Bytes.SIZEOF_BYTE;
1794     long bytesPerChecksum = Bytes.toInt(b, offset); 
1795     offset += Bytes.SIZEOF_INT;
1796     long onDiskDataSizeWithHeader = Bytes.toInt(b, offset); 
1797     offset += Bytes.SIZEOF_INT;
1798     return " Header dump: magic: " + magic +
1799                    " blockType " + bt +
1800                    " compressedBlockSizeNoHeader " + 
1801                    compressedBlockSizeNoHeader +
1802                    " uncompressedBlockSizeNoHeader " + 
1803                    uncompressedBlockSizeNoHeader +
1804                    " prevBlockOffset " + prevBlockOffset +
1805                    " checksumType " + ChecksumType.codeToType(cksumtype) +
1806                    " bytesPerChecksum " + bytesPerChecksum +
1807                    " onDiskDataSizeWithHeader " + onDiskDataSizeWithHeader;
1808   }
1809 }
1810