/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.io.hfile;

import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValue.KeyComparator;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.io.hfile.HFile.Writer;
import org.apache.hadoop.hbase.io.hfile.HFileBlock.BlockWritable;
import org.apache.hadoop.hbase.util.BloomFilterWriter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ChecksumType;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;

/**
 * Writes HFile format version 2.
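 *
 * <p>A minimal usage sketch against the constructor defined below. The stream,
 * block size, compression and encoder choices are illustrative assumptions
 * (NoOpDataBlockEncoder.INSTANCE is assumed as the "no encoding" option), and
 * conf, cacheConf, fs, path and the KeyValues are supplied by the caller:
 *
 * <pre>
 * HFileWriterV2 writer = new HFileWriterV2(conf, cacheConf, fs, path,
 *     null, // null ostream: the writer creates its own output stream
 *     64 * 1024, Compression.Algorithm.NONE, NoOpDataBlockEncoder.INSTANCE,
 *     KeyValue.KEY_COMPARATOR, HFile.DEFAULT_CHECKSUM_TYPE,
 *     HFile.DEFAULT_BYTES_PER_CHECKSUM, true);
 * writer.append(kv1); // KeyValues must arrive in comparator order
 * writer.append(kv2);
 * writer.close();     // writes indexes, file info and the trailer
 * </pre>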
 */
@InterfaceAudience.Private
public class HFileWriterV2 extends AbstractHFileWriter {
  static final Log LOG = LogFactory.getLog(HFileWriterV2.class);

  /** Max memstore (mvcc) timestamp in FileInfo */
  public static final byte [] MAX_MEMSTORE_TS_KEY =
      Bytes.toBytes("MAX_MEMSTORE_TS_KEY");

  /** KeyValue version in FileInfo */
  public static final byte [] KEY_VALUE_VERSION =
      Bytes.toBytes("KEY_VALUE_VERSION");

  /** Version for KeyValue which includes memstore timestamp */
  public static final int KEY_VALUE_VER_WITH_MEMSTORE = 1;

  /** Inline block writers for multi-level block index and compound Blooms. */
  private List<InlineBlockWriter> inlineBlockWriters =
      new ArrayList<InlineBlockWriter>();

  /** Unified version 2 block writer */
  private HFileBlock.Writer fsBlockWriter;

  private HFileBlockIndex.BlockIndexWriter dataBlockIndexWriter;
  private HFileBlockIndex.BlockIndexWriter metaBlockIndexWriter;

  /** The offset of the first data block or -1 if the file is empty. */
  private long firstDataBlockOffset = -1;

  /** The offset of the last data block or 0 if the file is empty. */
  private long lastDataBlockOffset;

  /** The last (stop) key of the previous data block. */
  private byte[] lastKeyOfPreviousBlock = null;

  /** Additional data items to be written to the "load-on-open" section. */
  private List<BlockWritable> additionalLoadOnOpenData =
    new ArrayList<BlockWritable>();

  /** Checksum related settings */
  private ChecksumType checksumType = HFile.DEFAULT_CHECKSUM_TYPE;
  private int bytesPerChecksum = HFile.DEFAULT_BYTES_PER_CHECKSUM;

  private final boolean includeMemstoreTS;
  private long maxMemstoreTS = 0;

  static class WriterFactoryV2 extends HFile.WriterFactory {
    WriterFactoryV2(Configuration conf, CacheConfig cacheConf) {
      super(conf, cacheConf);
    }

    @Override
    public Writer createWriter(FileSystem fs, Path path,
        FSDataOutputStream ostream, int blockSize,
        Compression.Algorithm compress, HFileDataBlockEncoder blockEncoder,
        final KeyComparator comparator, final ChecksumType checksumType,
        final int bytesPerChecksum, boolean includeMVCCReadpoint) throws IOException {
      return new HFileWriterV2(conf, cacheConf, fs, path, ostream, blockSize, compress,
          blockEncoder, comparator, checksumType, bytesPerChecksum, includeMVCCReadpoint);
    }
  }

  /** Constructor that takes a path, creates and closes the output stream. */
  public HFileWriterV2(Configuration conf, CacheConfig cacheConf,
      FileSystem fs, Path path, FSDataOutputStream ostream, int blockSize,
      Compression.Algorithm compressAlgo, HFileDataBlockEncoder blockEncoder,
      final KeyComparator comparator, final ChecksumType checksumType,
      final int bytesPerChecksum, final boolean includeMVCCReadpoint) throws IOException {
    super(cacheConf,
        ostream == null ? createOutputStream(conf, fs, path, null) : ostream,
        path, blockSize, compressAlgo, blockEncoder, comparator);
    this.checksumType = checksumType;
    this.bytesPerChecksum = bytesPerChecksum;
    this.includeMemstoreTS = includeMVCCReadpoint;
    finishInit(conf);
  }

  /** Additional initialization steps */
  private void finishInit(final Configuration conf) {
    if (fsBlockWriter != null)
      throw new IllegalStateException("finishInit called twice");

    // HFile filesystem-level (non-caching) block writer
    fsBlockWriter = new HFileBlock.Writer(compressAlgo, blockEncoder,
        includeMemstoreTS, checksumType, bytesPerChecksum);

    // Data block index writer
    boolean cacheIndexesOnWrite = cacheConf.shouldCacheIndexesOnWrite();
    dataBlockIndexWriter = new HFileBlockIndex.BlockIndexWriter(fsBlockWriter,
        cacheIndexesOnWrite ? cacheConf.getBlockCache() : null,
        cacheIndexesOnWrite ? name : null);
    dataBlockIndexWriter.setMaxChunkSize(
        HFileBlockIndex.getMaxChunkSize(conf));
    inlineBlockWriters.add(dataBlockIndexWriter);

    // Meta data block index writer
    metaBlockIndexWriter = new HFileBlockIndex.BlockIndexWriter();
    if (LOG.isTraceEnabled()) LOG.trace("Initialized with " + cacheConf);
  }

  /**
   * At a block boundary, writes out all the inline blocks and opens a new block.
   *
   * @throws IOException
   */
  private void checkBlockBoundary() throws IOException {
    if (fsBlockWriter.blockSizeWritten() < blockSize)
      return;

    finishBlock();
    writeInlineBlocks(false);
    newBlock();
  }

  /** Clean up the current block */
  private void finishBlock() throws IOException {
    if (!fsBlockWriter.isWriting() || fsBlockWriter.blockSizeWritten() == 0)
      return;

    long startTimeNs = System.nanoTime();
    // Update the first data block offset for scanning.
    if (firstDataBlockOffset == -1) {
      firstDataBlockOffset = outputStream.getPos();
    }
    // Update the last data block offset
    lastDataBlockOffset = outputStream.getPos();
    fsBlockWriter.writeHeaderAndData(outputStream);
    int onDiskSize = fsBlockWriter.getOnDiskSizeWithHeader();
    // Generate a shorter faked key into the index block. For example, consider a block boundary
    // between the keys "the quick brown fox" and "the who test text". We can use "the r" as the
    // key for the index block entry since it is > all entries in the previous block and <= all
    // entries in subsequent blocks.
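    // In other words, the faked key must satisfy
    // lastKeyOfPreviousBlock < fakeKey <= firstKeyInBlock; the two sanity
    // checks below enforce exactly this invariant.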
    if (comparator instanceof KeyComparator) {
      byte[] fakeKey = ((KeyComparator) comparator).getShortMidpointKey(
        lastKeyOfPreviousBlock, firstKeyInBlock);
      if (comparator.compare(fakeKey, firstKeyInBlock) > 0) {
        throw new IOException("Unexpected getShortMidpointKey result, fakeKey:" +
            Bytes.toString(fakeKey) + ", firstKeyInBlock:" +
            Bytes.toString(firstKeyInBlock));
      }
      if (lastKeyOfPreviousBlock != null && comparator.compare(lastKeyOfPreviousBlock,
        fakeKey) >= 0) {
        throw new IOException("Unexpected getShortMidpointKey result, lastKeyOfPreviousBlock:" +
            Bytes.toString(lastKeyOfPreviousBlock) + ", fakeKey:" +
            Bytes.toString(fakeKey));
      }
      dataBlockIndexWriter.addEntry(fakeKey, lastDataBlockOffset, onDiskSize);
    } else {
      dataBlockIndexWriter.addEntry(firstKeyInBlock, lastDataBlockOffset, onDiskSize);
    }
    totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();
    HFile.offerWriteLatency(System.nanoTime() - startTimeNs);
    if (cacheConf.shouldCacheDataOnWrite()) {
      doCacheOnWrite(lastDataBlockOffset);
    }
  }

  /** Gives inline block writers an opportunity to contribute blocks. */
  private void writeInlineBlocks(boolean closing) throws IOException {
    for (InlineBlockWriter ibw : inlineBlockWriters) {
      while (ibw.shouldWriteBlock(closing)) {
        long offset = outputStream.getPos();
        boolean cacheThisBlock = ibw.getCacheOnWrite();
        ibw.writeInlineBlock(fsBlockWriter.startWriting(
            ibw.getInlineBlockType()));
        fsBlockWriter.writeHeaderAndData(outputStream);
        ibw.blockWritten(offset, fsBlockWriter.getOnDiskSizeWithHeader(),
            fsBlockWriter.getUncompressedSizeWithoutHeader());
        totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();

        if (cacheThisBlock) {
          doCacheOnWrite(offset);
        }
      }
    }
  }

  /**
   * Caches the last written HFile block.
   * @param offset the offset of the block we want to cache. Used to determine
   *          the cache key.
   */
  private void doCacheOnWrite(long offset) {
    // We don't cache-on-write data blocks on compaction, so assume this is not
    // a compaction.
    final boolean isCompaction = false;
    HFileBlock cacheFormatBlock = blockEncoder.diskToCacheFormat(
        fsBlockWriter.getBlockForCaching(), isCompaction);
    cacheConf.getBlockCache().cacheBlock(
        new BlockCacheKey(name, offset, blockEncoder.getEncodingInCache(),
            cacheFormatBlock.getBlockType()), cacheFormatBlock);
  }

  /**
   * Ready a new block for writing.
   *
   * @throws IOException
   */
  private void newBlock() throws IOException {
    // This is where the next block begins.
    fsBlockWriter.startWriting(BlockType.DATA);
    firstKeyInBlock = null;
    if (lastKeyLength > 0) {
      lastKeyOfPreviousBlock = new byte[lastKeyLength];
      System.arraycopy(lastKeyBuffer, lastKeyOffset, lastKeyOfPreviousBlock, 0, lastKeyLength);
    }
  }

  /**
   * Add a meta block to the end of the file. Call before close(). Metadata
   * blocks are expensive. Fill one with a bunch of serialized data rather than
   * do a metadata block per metadata instance. If metadata is small, consider
   * adding it to the file info using {@link #appendFileInfo(byte[], byte[])}.
   *
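   * <p>An illustrative sketch only; the block name and the BytesWritable
   * payload are hypothetical stand-ins for the caller's own Writable:
   *
   * <pre>
   * // batch several serialized items into a single meta block
   * writer.appendMetaBlock("MY_STATS", new BytesWritable(statsBytes));
   * </pre>
   *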
   * @param metaBlockName
   *          name of the block
   * @param content
   *          will call readFields to get data later (DO NOT REUSE)
   */
  @Override
  public void appendMetaBlock(String metaBlockName, Writable content) {
    byte[] key = Bytes.toBytes(metaBlockName);
    int i;
    for (i = 0; i < metaNames.size(); ++i) {
      // stop when the current key is greater than our own
      byte[] cur = metaNames.get(i);
      if (Bytes.BYTES_RAWCOMPARATOR.compare(cur, 0, cur.length, key, 0,
          key.length) > 0) {
        break;
      }
    }
    metaNames.add(i, key);
    metaData.add(i, content);
  }

  /**
   * Add key/value to file. Keys must be added in an order that agrees with the
   * Comparator passed on construction.
   *
   * @param kv
   *          KeyValue to add. Cannot be empty nor null.
   * @throws IOException
   */
  @Override
  public void append(final KeyValue kv) throws IOException {
    append(kv.getMvccVersion(), kv.getBuffer(), kv.getKeyOffset(), kv.getKeyLength(),
        kv.getBuffer(), kv.getValueOffset(), kv.getValueLength());
    this.maxMemstoreTS = Math.max(this.maxMemstoreTS, kv.getMvccVersion());
  }

  /**
   * Add key/value to file. Keys must be added in an order that agrees with the
   * Comparator passed on construction.
   *
   * @param key
   *          Key to add. Cannot be empty nor null.
   * @param value
   *          Value to add. Cannot be empty nor null.
   * @throws IOException
   */
  @Override
  public void append(final byte[] key, final byte[] value) throws IOException {
    append(0, key, 0, key.length, value, 0, value.length);
  }

  /**
   * Add key/value to file. Keys must be added in an order that agrees with the
   * Comparator passed on construction.
   *
   * @param memstoreTS
   * @param key
   * @param koffset
   * @param klength
   * @param value
   * @param voffset
   * @param vlength
   * @throws IOException
   */
  private void append(final long memstoreTS, final byte[] key, final int koffset, final int klength,
      final byte[] value, final int voffset, final int vlength)
      throws IOException {
    boolean dupKey = checkKey(key, koffset, klength);
    checkValue(value, voffset, vlength);
    if (!dupKey) {
      checkBlockBoundary();
    }

    if (!fsBlockWriter.isWriting())
      newBlock();

    // Write length of key and value and then actual key and value bytes.
    // Additionally, we may also write down the memstoreTS.
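    // The record layout produced below is:
    //   keyLen (int) | valueLen (int) | key bytes | value bytes
    // followed by a memstoreTS vlong only when includeMemstoreTS is set.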
    {
      DataOutputStream out = fsBlockWriter.getUserDataStream();
      out.writeInt(klength);
      totalKeyLength += klength;
      out.writeInt(vlength);
      totalValueLength += vlength;
      out.write(key, koffset, klength);
      out.write(value, voffset, vlength);
      if (this.includeMemstoreTS) {
        WritableUtils.writeVLong(out, memstoreTS);
      }
    }

    // Are we the first key in this block?
    if (firstKeyInBlock == null) {
      // Copy the key.
      firstKeyInBlock = new byte[klength];
      System.arraycopy(key, koffset, firstKeyInBlock, 0, klength);
    }

    lastKeyBuffer = key;
    lastKeyOffset = koffset;
    lastKeyLength = klength;
    entryCount++;
  }

  @Override
  public void close() throws IOException {
    if (outputStream == null) {
      return;
    }
    // Save data block encoder metadata in the file info.
    blockEncoder.saveMetadata(this);
    // Write out the end of the data blocks, then write the meta data blocks,
    // followed by the file info, the data block index and the meta block index.

    finishBlock();
    writeInlineBlocks(true);

    FixedFileTrailer trailer = new FixedFileTrailer(2,
                                 HFileReaderV2.MAX_MINOR_VERSION);

    // Write out the metadata blocks if any.
    if (!metaNames.isEmpty()) {
      for (int i = 0; i < metaNames.size(); ++i) {
        // store the beginning offset
        long offset = outputStream.getPos();
        // write the metadata content
        DataOutputStream dos = fsBlockWriter.startWriting(BlockType.META);
        metaData.get(i).write(dos);

        fsBlockWriter.writeHeaderAndData(outputStream);
        totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();

        // Add the new meta block to the meta index.
        metaBlockIndexWriter.addEntry(metaNames.get(i), offset,
            fsBlockWriter.getOnDiskSizeWithHeader());
      }
    }

    // Load-on-open section.

    // Data block index.
    //
    // In version 2, this section of the file starts with the root-level data
    // block index. We call a function that writes intermediate-level blocks
    // first, then the root level, and returns the offset of the root-level
    // block index.
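    //
    // The rest of the load-on-open section is written below in this order:
    // meta block index, file info, additional load-on-open data (e.g. Bloom
    // filter metadata), and finally the fixed trailer via finishClose().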

    long rootIndexOffset = dataBlockIndexWriter.writeIndexBlocks(outputStream);
    trailer.setLoadOnOpenOffset(rootIndexOffset);

    // Meta block index.
    metaBlockIndexWriter.writeSingleLevelIndex(fsBlockWriter.startWriting(
        BlockType.ROOT_INDEX), "meta");
    fsBlockWriter.writeHeaderAndData(outputStream);
    totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();

    if (this.includeMemstoreTS) {
      appendFileInfo(MAX_MEMSTORE_TS_KEY, Bytes.toBytes(maxMemstoreTS));
      appendFileInfo(KEY_VALUE_VERSION, Bytes.toBytes(KEY_VALUE_VER_WITH_MEMSTORE));
    }

    // File info
    writeFileInfo(trailer, fsBlockWriter.startWriting(BlockType.FILE_INFO));
    fsBlockWriter.writeHeaderAndData(outputStream);
    totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();

    // Load-on-open data supplied by higher levels, e.g. Bloom filters.
    for (BlockWritable w : additionalLoadOnOpenData) {
      fsBlockWriter.writeBlock(w, outputStream);
      totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();
    }

    // Now finish off the trailer.
    trailer.setNumDataIndexLevels(dataBlockIndexWriter.getNumLevels());
    trailer.setUncompressedDataIndexSize(
        dataBlockIndexWriter.getTotalUncompressedSize());
    trailer.setFirstDataBlockOffset(firstDataBlockOffset);
    trailer.setLastDataBlockOffset(lastDataBlockOffset);
    trailer.setComparatorClass(comparator.getClass());
    trailer.setDataIndexCount(dataBlockIndexWriter.getNumRootEntries());

    finishClose(trailer);

    fsBlockWriter.release();
  }

  @Override
  public void addInlineBlockWriter(InlineBlockWriter ibw) {
    inlineBlockWriters.add(ibw);
  }

  @Override
  public void addGeneralBloomFilter(final BloomFilterWriter bfw) {
    this.addBloomFilter(bfw, BlockType.GENERAL_BLOOM_META);
  }

  @Override
  public void addDeleteFamilyBloomFilter(final BloomFilterWriter bfw) {
    this.addBloomFilter(bfw, BlockType.DELETE_FAMILY_BLOOM_META);
  }

  private void addBloomFilter(final BloomFilterWriter bfw,
      final BlockType blockType) {
    if (bfw.getKeyCount() <= 0)
      return;

    if (blockType != BlockType.GENERAL_BLOOM_META &&
        blockType != BlockType.DELETE_FAMILY_BLOOM_META) {
      throw new RuntimeException("Block Type: " + blockType.toString() +
          " is not supported");
    }
    additionalLoadOnOpenData.add(new BlockWritable() {
      @Override
      public BlockType getBlockType() {
        return blockType;
      }

      @Override
      public void writeToBlock(DataOutput out) throws IOException {
        bfw.getMetaWriter().write(out);
        Writable dataWriter = bfw.getDataWriter();
        if (dataWriter != null)
          dataWriter.write(out);
      }
    });
  }
}