1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.io.hfile;
21  
22  import java.io.DataOutput;
23  import java.io.DataOutputStream;
24  import java.io.IOException;
25  import java.util.ArrayList;
26  import java.util.List;
27  
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  import org.apache.hadoop.classification.InterfaceAudience;
31  import org.apache.hadoop.conf.Configuration;
32  import org.apache.hadoop.fs.FSDataOutputStream;
33  import org.apache.hadoop.fs.FileSystem;
34  import org.apache.hadoop.fs.Path;
35  import org.apache.hadoop.hbase.KeyValue;
36  import org.apache.hadoop.hbase.KeyValue.KeyComparator;
37  import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
38  import org.apache.hadoop.hbase.io.compress.Compression;
39  import org.apache.hadoop.hbase.io.hfile.HFile.Writer;
40  import org.apache.hadoop.hbase.io.hfile.HFileBlock.BlockWritable;
41  import org.apache.hadoop.hbase.util.ChecksumType;
42  import org.apache.hadoop.hbase.util.BloomFilterWriter;
43  import org.apache.hadoop.hbase.util.Bytes;
44  import org.apache.hadoop.io.Writable;
45  import org.apache.hadoop.io.WritableUtils;
46  
47  /**
48   * Writes HFile format version 2.
49   */
50  @InterfaceAudience.Private
51  public class HFileWriterV2 extends AbstractHFileWriter {
52    static final Log LOG = LogFactory.getLog(HFileWriterV2.class);
53  
54    /** Max memstore (mvcc) timestamp in FileInfo */
55    public static final byte [] MAX_MEMSTORE_TS_KEY =
56        Bytes.toBytes("MAX_MEMSTORE_TS_KEY");
57  
58    /** KeyValue version in FileInfo */
59    public static final byte [] KEY_VALUE_VERSION =
60        Bytes.toBytes("KEY_VALUE_VERSION");
61  
62    /** Version for KeyValue which includes memstore timestamp */
63    public static final int KEY_VALUE_VER_WITH_MEMSTORE = 1;
64  
65    /** Inline block writers for multi-level block index and compound Blooms. */
66    private List<InlineBlockWriter> inlineBlockWriters =
67        new ArrayList<InlineBlockWriter>();
68  
69    /** Unified version 2 block writer */
70    private HFileBlock.Writer fsBlockWriter;
71  
72    private HFileBlockIndex.BlockIndexWriter dataBlockIndexWriter;
73    private HFileBlockIndex.BlockIndexWriter metaBlockIndexWriter;
74  
75    /** The offset of the first data block or -1 if the file is empty. */
76    private long firstDataBlockOffset = -1;
77  
78    /** The offset of the last data block or 0 if the file is empty. */
79    private long lastDataBlockOffset;
80  
81    /** The last (stop) key of the previous data block. */
82    private byte[] lastKeyOfPreviousBlock = null;
83  
84    /** Additional data items to be written to the "load-on-open" section. */
85    private List<BlockWritable> additionalLoadOnOpenData =
86      new ArrayList<BlockWritable>();
87  
88    /** Checksum related settings */
89    private ChecksumType checksumType = HFile.DEFAULT_CHECKSUM_TYPE;
90    private int bytesPerChecksum = HFile.DEFAULT_BYTES_PER_CHECKSUM;
91  
92    private final boolean includeMemstoreTS;
93    private long maxMemstoreTS = 0;
94  
95    static class WriterFactoryV2 extends HFile.WriterFactory {
96      WriterFactoryV2(Configuration conf, CacheConfig cacheConf) {
97        super(conf, cacheConf);
98      }
99  
100     @Override
101     public Writer createWriter(FileSystem fs, Path path,
102         FSDataOutputStream ostream, int blockSize,
103         Compression.Algorithm compress, HFileDataBlockEncoder blockEncoder,
104         final KeyComparator comparator, final ChecksumType checksumType,
105         final int bytesPerChecksum, boolean includeMVCCReadpoint) throws IOException {
106       return new HFileWriterV2(conf, cacheConf, fs, path, ostream, blockSize, compress,
107           blockEncoder, comparator, checksumType, bytesPerChecksum, includeMVCCReadpoint);
108     }
109   }
110 
111   /** Constructor that takes a path, creates and closes the output stream. */
112   public HFileWriterV2(Configuration conf, CacheConfig cacheConf,
113       FileSystem fs, Path path, FSDataOutputStream ostream, int blockSize,
114       Compression.Algorithm compressAlgo, HFileDataBlockEncoder blockEncoder,
115       final KeyComparator comparator, final ChecksumType checksumType,
116       final int bytesPerChecksum, final boolean includeMVCCReadpoint) throws IOException {
117     super(cacheConf,
118         ostream == null ? createOutputStream(conf, fs, path) : ostream,
119         path, blockSize, compressAlgo, blockEncoder, comparator);
120     this.checksumType = checksumType;
121     this.bytesPerChecksum = bytesPerChecksum;
122     this.includeMemstoreTS = includeMVCCReadpoint;
123     finishInit(conf);
124   }
125 
126   /** Additional initialization steps */
127   private void finishInit(final Configuration conf) {
128     if (fsBlockWriter != null)
129       throw new IllegalStateException("finishInit called twice");
130 
131     // HFile filesystem-level (non-caching) block writer
132     fsBlockWriter = new HFileBlock.Writer(compressAlgo, blockEncoder,
133         includeMemstoreTS, checksumType, bytesPerChecksum);
134 
135     // Data block index writer
136     boolean cacheIndexesOnWrite = cacheConf.shouldCacheIndexesOnWrite();
137     dataBlockIndexWriter = new HFileBlockIndex.BlockIndexWriter(fsBlockWriter,
138         cacheIndexesOnWrite ? cacheConf.getBlockCache(): null,
139         cacheIndexesOnWrite ? name : null);
140     dataBlockIndexWriter.setMaxChunkSize(
141         HFileBlockIndex.getMaxChunkSize(conf));
142     inlineBlockWriters.add(dataBlockIndexWriter);
143 
144     // Meta data block index writer
145     metaBlockIndexWriter = new HFileBlockIndex.BlockIndexWriter();
146     LOG.debug("Initialized with " + cacheConf);
147   }
148 
149   /**
150    * At a block boundary, writes all the inline blocks and opens a new block.
151    *
152    * @throws IOException
153    */
154   private void checkBlockBoundary() throws IOException {
155     if (fsBlockWriter.blockSizeWritten() < blockSize)
156       return;
157 
158     finishBlock();
159     writeInlineBlocks(false);
160     newBlock();
161   }
162 
163   /** Finish the current data block: write it out and add its index entry. */
164   private void finishBlock() throws IOException {
165     if (!fsBlockWriter.isWriting() || fsBlockWriter.blockSizeWritten() == 0)
166       return;
167 
168     long startTimeNs = System.nanoTime();
169     // Update the first data block offset for scanning.
170     if (firstDataBlockOffset == -1) {
171       firstDataBlockOffset = outputStream.getPos();
172     }
173     // Update the last data block offset
174     lastDataBlockOffset = outputStream.getPos();
175     fsBlockWriter.writeHeaderAndData(outputStream);
176     int onDiskSize = fsBlockWriter.getOnDiskSizeWithHeader();
177     // Generate a shorter faked key for the index block. For example, consider a block boundary
178     // between the keys "the quick brown fox" and "the who test text". We can use "the r" as the
179     // key for the index block entry since it is > all entries in the previous block and <= all
180     // entries in subsequent blocks.
181     if (comparator instanceof KeyComparator) {
182       byte[] fakeKey = ((KeyComparator) comparator).getShortMidpointKey(
183         lastKeyOfPreviousBlock, firstKeyInBlock);
184       if (comparator.compare(fakeKey, firstKeyInBlock) > 0) {
185         throw new IOException("Unexpected getShortMidpointKey result, fakeKey:"
186             + Bytes.toStringBinary(fakeKey) + ", firstKeyInBlock:" + Bytes.toStringBinary(firstKeyInBlock));
187       }
188       if (lastKeyOfPreviousBlock != null && comparator.compare(lastKeyOfPreviousBlock,
189         fakeKey) >= 0) {
190         throw new IOException("Unexpected getShortMidpointKey result, lastKeyOfPreviousBlock:" +
191             Bytes.toString(lastKeyOfPreviousBlock) + ", fakeKey:" +
192             Bytes.toString(fakeKey));
193       }
194       dataBlockIndexWriter.addEntry(fakeKey, lastDataBlockOffset, onDiskSize);
195     } else {
196       dataBlockIndexWriter.addEntry(firstKeyInBlock, lastDataBlockOffset, onDiskSize);
197     }
198     totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();
199     HFile.offerWriteLatency(System.nanoTime() - startTimeNs);
200     if (cacheConf.shouldCacheDataOnWrite()) {
201       doCacheOnWrite(lastDataBlockOffset);
202     }
203   }
204 
205   /** Gives inline block writers an opportunity to contribute blocks. */
206   private void writeInlineBlocks(boolean closing) throws IOException {
207     for (InlineBlockWriter ibw : inlineBlockWriters) {
208       while (ibw.shouldWriteBlock(closing)) {
209         long offset = outputStream.getPos();
210         boolean cacheThisBlock = ibw.getCacheOnWrite();
211         ibw.writeInlineBlock(fsBlockWriter.startWriting(
212             ibw.getInlineBlockType()));
213         fsBlockWriter.writeHeaderAndData(outputStream);
214         ibw.blockWritten(offset, fsBlockWriter.getOnDiskSizeWithHeader(),
215             fsBlockWriter.getUncompressedSizeWithoutHeader());
216         totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();
217 
218         if (cacheThisBlock) {
219           doCacheOnWrite(offset);
220         }
221       }
222     }
223   }
224 
225   /**
226    * Caches the last written HFile block.
227    * @param offset the offset of the block we want to cache. Used to determine
228    *          the cache key.
229    */
230   private void doCacheOnWrite(long offset) {
231     // We don't cache-on-write data blocks on compaction, so assume this is not
232     // a compaction.
233     final boolean isCompaction = false;
234     HFileBlock cacheFormatBlock = blockEncoder.diskToCacheFormat(
235         fsBlockWriter.getBlockForCaching(), isCompaction);
236     cacheConf.getBlockCache().cacheBlock(
237         new BlockCacheKey(name, offset, blockEncoder.getEncodingInCache(),
238             cacheFormatBlock.getBlockType()), cacheFormatBlock);
239   }
240 
241   /**
242    * Ready a new block for writing.
243    *
244    * @throws IOException
245    */
246   private void newBlock() throws IOException {
247     // This is where the next block begins.
248     fsBlockWriter.startWriting(BlockType.DATA);
249     firstKeyInBlock = null;
250     if (lastKeyLength > 0) {
251       lastKeyOfPreviousBlock = new byte[lastKeyLength];
252       System.arraycopy(lastKeyBuffer, lastKeyOffset, lastKeyOfPreviousBlock, 0, lastKeyLength);
253     }
254   }
255 
256   /**
257    * Add a meta block to the end of the file. Call before close(). Metadata
258    * blocks are expensive: fill one with a bunch of serialized data rather than
259    * writing a metadata block per metadata instance. If the metadata is small,
260    * consider adding it to the file info using {@link #appendFileInfo(byte[], byte[])}.
261    *
262    * @param metaBlockName
263    *          name of the block
264    * @param content
265    *          will call readFields to get data later (DO NOT REUSE)
266    */
267   @Override
268   public void appendMetaBlock(String metaBlockName, Writable content) {
269     byte[] key = Bytes.toBytes(metaBlockName);
270     int i;
271     for (i = 0; i < metaNames.size(); ++i) {
272       // stop when the current key is greater than our own
273       byte[] cur = metaNames.get(i);
274       if (Bytes.BYTES_RAWCOMPARATOR.compare(cur, 0, cur.length, key, 0,
275           key.length) > 0) {
276         break;
277       }
278     }
279     metaNames.add(i, key);
280     metaData.add(i, content);
281   }
282 
283   /**
284    * Add key/value to file. Keys must be added in an order that agrees with the
285    * Comparator passed on construction.
286    *
287    * @param kv
288    *          KeyValue to add. Cannot be null or empty.
289    * @throws IOException
290    */
291   @Override
292   public void append(final KeyValue kv) throws IOException {
293     append(kv.getMemstoreTS(), kv.getBuffer(), kv.getKeyOffset(), kv.getKeyLength(),
294         kv.getBuffer(), kv.getValueOffset(), kv.getValueLength());
295     this.maxMemstoreTS = Math.max(this.maxMemstoreTS, kv.getMemstoreTS());
296   }
297 
298   /**
299    * Add key/value to file. Keys must be added in an order that agrees with the
300    * Comparator passed on construction.
301    *
302    * @param key
303    *          Key to add. Cannot be null or empty.
304    * @param value
305    *          Value to add. Cannot be null or empty.
306    * @throws IOException
307    */
308   @Override
309   public void append(final byte[] key, final byte[] value) throws IOException {
310     append(0, key, 0, key.length, value, 0, value.length);
311   }
312 
313   /**
314    * Add key/value to file. Keys must be added in an order that agrees with the
315    * Comparator passed on construction.
316    * @param memstoreTS memstore timestamp (MVCC read point) of the key/value
317    * @param key buffer containing the key
318    * @param koffset offset of the key in its buffer
319    * @param klength length of the key
320    * @param value buffer containing the value
321    * @param voffset offset of the value in its buffer
322    * @param vlength length of the value
323    * @throws IOException
324    */
325   private void append(final long memstoreTS, final byte[] key, final int koffset, final int klength,
326       final byte[] value, final int voffset, final int vlength)
327       throws IOException {
328     boolean dupKey = checkKey(key, koffset, klength);
329     checkValue(value, voffset, vlength);
330     if (!dupKey) {
331       checkBlockBoundary();
332     }
333 
334     if (!fsBlockWriter.isWriting())
335       newBlock();
336 
337     // Write length of key and value and then actual key and value bytes.
338     // Additionally, we may also write down the memstoreTS.
339     {
340       DataOutputStream out = fsBlockWriter.getUserDataStream();
341       out.writeInt(klength);
342       totalKeyLength += klength;
343       out.writeInt(vlength);
344       totalValueLength += vlength;
345       out.write(key, koffset, klength);
346       out.write(value, voffset, vlength);
347       if (this.includeMemstoreTS) {
348         WritableUtils.writeVLong(out, memstoreTS);
349       }
350     }
351 
352     // Are we the first key in this block?
353     if (firstKeyInBlock == null) {
354       // Copy the key.
355       firstKeyInBlock = new byte[klength];
356       System.arraycopy(key, koffset, firstKeyInBlock, 0, klength);
357     }
358 
359     lastKeyBuffer = key;
360     lastKeyOffset = koffset;
361     lastKeyLength = klength;
362     entryCount++;
363   }
364 
365   @Override
366   public void close() throws IOException {
367     if (outputStream == null) {
368       return;
369     }
370     // Save data block encoder metadata in the file info.
371     blockEncoder.saveMetadata(this);
372     // Write out the end of the data blocks, then write the meta data blocks,
373     // followed by the fileinfo, data block index and meta block index.
374 
375     finishBlock();
376     writeInlineBlocks(true);
377 
378     FixedFileTrailer trailer = new FixedFileTrailer(2, 
379                                  HFileReaderV2.MAX_MINOR_VERSION);
380 
381     // Write out the metadata blocks if any.
382     if (!metaNames.isEmpty()) {
383       for (int i = 0; i < metaNames.size(); ++i) {
384         // store the beginning offset
385         long offset = outputStream.getPos();
386         // write the metadata content
387         DataOutputStream dos = fsBlockWriter.startWriting(BlockType.META);
388         metaData.get(i).write(dos);
389 
390         fsBlockWriter.writeHeaderAndData(outputStream);
391         totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();
392 
393         // Add the new meta block to the meta index.
394         metaBlockIndexWriter.addEntry(metaNames.get(i), offset,
395             fsBlockWriter.getOnDiskSizeWithHeader());
396       }
397     }
398 
399     // Load-on-open section.
400 
401     // Data block index.
402     //
403     // In version 2, this section of the file starts with the root level data
404     // block index. We call a function that writes intermediate-level blocks
405     // first, then root level, and returns the offset of the root level block
406     // index.
407 
408     long rootIndexOffset = dataBlockIndexWriter.writeIndexBlocks(outputStream);
409     trailer.setLoadOnOpenOffset(rootIndexOffset);
410 
411     // Meta block index.
412     metaBlockIndexWriter.writeSingleLevelIndex(fsBlockWriter.startWriting(
413         BlockType.ROOT_INDEX), "meta");
414     fsBlockWriter.writeHeaderAndData(outputStream);
415     totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();
416 
417     if (this.includeMemstoreTS) {
418       appendFileInfo(MAX_MEMSTORE_TS_KEY, Bytes.toBytes(maxMemstoreTS));
419       appendFileInfo(KEY_VALUE_VERSION, Bytes.toBytes(KEY_VALUE_VER_WITH_MEMSTORE));
420     }
421 
422     // File info
423     writeFileInfo(trailer, fsBlockWriter.startWriting(BlockType.FILE_INFO));
424     fsBlockWriter.writeHeaderAndData(outputStream);
425     totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();
426 
427     // Load-on-open data supplied by higher levels, e.g. Bloom filters.
428     for (BlockWritable w : additionalLoadOnOpenData) {
429       fsBlockWriter.writeBlock(w, outputStream);
430       totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();
431     }
432 
433     // Now finish off the trailer.
434     trailer.setNumDataIndexLevels(dataBlockIndexWriter.getNumLevels());
435     trailer.setUncompressedDataIndexSize(
436         dataBlockIndexWriter.getTotalUncompressedSize());
437     trailer.setFirstDataBlockOffset(firstDataBlockOffset);
438     trailer.setLastDataBlockOffset(lastDataBlockOffset);
439     trailer.setComparatorClass(comparator.getClass());
440     trailer.setDataIndexCount(dataBlockIndexWriter.getNumRootEntries());
441 
442 
443     finishClose(trailer);
444 
445     fsBlockWriter.release();
446   }
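
  // For orientation, a summary of the write order produced by close() above: data blocks
  // with interleaved inline blocks (leaf-level index blocks, Bloom chunks), any meta
  // blocks, then the data block index (intermediate levels, then the root, whose offset
  // marks the start of the load-on-open section), the meta block index, the file info
  // block, additional load-on-open data such as Bloom filter metadata, and finally the
  // fixed file trailer.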
447 
448   @Override
449   public void addInlineBlockWriter(InlineBlockWriter ibw) {
450     inlineBlockWriters.add(ibw);
451   }
452 
453   @Override
454   public void addGeneralBloomFilter(final BloomFilterWriter bfw) {
455     this.addBloomFilter(bfw, BlockType.GENERAL_BLOOM_META);
456   }
457 
458   @Override
459   public void addDeleteFamilyBloomFilter(final BloomFilterWriter bfw) {
460     this.addBloomFilter(bfw, BlockType.DELETE_FAMILY_BLOOM_META);
461   }
462 
463   private void addBloomFilter(final BloomFilterWriter bfw,
464       final BlockType blockType) {
465     if (bfw.getKeyCount() <= 0)
466       return;
467 
468     if (blockType != BlockType.GENERAL_BLOOM_META &&
469         blockType != BlockType.DELETE_FAMILY_BLOOM_META) {
470       throw new RuntimeException("Block Type: " + blockType.toString() +
471           "is not supported");
472     }
473     additionalLoadOnOpenData.add(new BlockWritable() {
474       @Override
475       public BlockType getBlockType() {
476         return blockType;
477       }
478 
479       @Override
480       public void writeToBlock(DataOutput out) throws IOException {
481         bfw.getMetaWriter().write(out);
482         Writable dataWriter = bfw.getDataWriter();
483         if (dataWriter != null)
484           dataWriter.write(out);
485       }
486     });
487   }
488 }
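
A minimal usage sketch for this writer (illustrative only; the example class, output path,
column coordinates, and the NoOpDataBlockEncoder / KeyValue.KEY_COMPARATOR / HFile default
constants chosen below are assumptions for the example, not taken from this file):

package org.apache.hadoop.hbase.io.hfile;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.util.Bytes;

public class HFileWriterV2Example {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    FileSystem fs = FileSystem.get(conf);
    Path path = new Path("/tmp/example.hfile");   // hypothetical output location
    CacheConfig cacheConf = new CacheConfig(conf);

    // Construct the v2 writer through its public constructor; passing a null stream
    // lets the writer create the output stream for the given path.
    HFile.Writer writer = new HFileWriterV2(conf, cacheConf, fs, path, null,
        HFile.DEFAULT_BLOCKSIZE, Compression.Algorithm.NONE,
        NoOpDataBlockEncoder.INSTANCE, KeyValue.KEY_COMPARATOR,
        HFile.DEFAULT_CHECKSUM_TYPE, HFile.DEFAULT_BYTES_PER_CHECKSUM,
        true /* include MVCC read points */);
    try {
      // Keys must be appended in the order defined by the comparator.
      writer.append(new KeyValue(Bytes.toBytes("row1"), Bytes.toBytes("f"),
          Bytes.toBytes("q"), Bytes.toBytes("v1")));
      writer.append(new KeyValue(Bytes.toBytes("row2"), Bytes.toBytes("f"),
          Bytes.toBytes("q"), Bytes.toBytes("v2")));
    } finally {
      writer.close();
    }
  }
}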