View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.codec.prefixtree.encode;
20  
21  import java.io.IOException;
22  import java.io.OutputStream;
23  
24  import org.apache.commons.logging.Log;
25  import org.apache.commons.logging.LogFactory;
26  import org.apache.hadoop.classification.InterfaceAudience;
27  import org.apache.hadoop.hbase.Cell;
28  import org.apache.hadoop.hbase.CellUtil;
29  import org.apache.hadoop.hbase.KeyValueUtil;
30  import org.apache.hadoop.hbase.codec.prefixtree.PrefixTreeBlockMeta;
31  import org.apache.hadoop.hbase.codec.prefixtree.encode.column.ColumnSectionWriter;
32  import org.apache.hadoop.hbase.codec.prefixtree.encode.other.CellTypeEncoder;
33  import org.apache.hadoop.hbase.codec.prefixtree.encode.other.LongEncoder;
34  import org.apache.hadoop.hbase.codec.prefixtree.encode.row.RowSectionWriter;
35  import org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize.Tokenizer;
36  import org.apache.hadoop.hbase.io.CellOutputStream;
37  import org.apache.hadoop.hbase.util.ArrayUtils;
38  import org.apache.hadoop.hbase.util.ByteRange;
39  import org.apache.hadoop.hbase.util.SimpleByteRange;
40  import org.apache.hadoop.hbase.util.byterange.ByteRangeSet;
41  import org.apache.hadoop.hbase.util.byterange.impl.ByteRangeHashSet;
42  import org.apache.hadoop.hbase.util.byterange.impl.ByteRangeTreeSet;
43  import org.apache.hadoop.hbase.util.vint.UFIntTool;
44  import org.apache.hadoop.io.WritableUtils;
45  
/**
 * This is the primary class for encoding a stream of Cells into an encoded byte[]. It implements
 * CellOutputStream and writes the encoded bytes to the supplied OutputStream. As Cells are
 * added they are completely copied into the various encoding structures. This is important because
 * usually the cells being fed in during compactions will be transient.<br/>
 * <br/>
 * Usage:<br/>
 * 1) constructor<br/>
 * 2) append cells in sorted order: write(Cell cell)<br/>
 * 3) flush()<br/>
 */
56  @InterfaceAudience.Private
57  public class PrefixTreeEncoder implements CellOutputStream {
58  
59    /**************** static ************************/
60  
  // shared logger for this class
  protected static final Log LOG = LogFactory.getLog(PrefixTreeEncoder.class);

  //future-proof where HBase supports multiple families in a data block.
  //while false, only the family of the first cell of a block is collected (see addFamilyPart)
  public static final boolean MULITPLE_FAMILIES_POSSIBLE = false;

  // true: the constructor builds ByteRangeHashSet de-duplicators; false: ByteRangeTreeSet
  private static final boolean USE_HASH_COLUMN_SORTER = true;
  // starting length of the per-cell arrays (timestamps, mvccVersions, typeBytes, valueOffsets)
  private static final int INITIAL_PER_CELL_ARRAY_SIZES = 256;
  // starting length of the buffer that accumulates cell values
  private static final int VALUE_BUFFER_INIT_SIZE = 64 * 1024;
69  
70  
71    /**************** fields *************************/
72  
  // incremented by every call to reset(), including the one made from the constructor
  protected long numResets = 0L;

  // destination that flush() writes the encoded block to; replaced on each reset()
  protected OutputStream outputStream;

  /*
   * Cannot change during a single block's encoding. If false, then substitute incoming Cell's
   * mvccVersion with zero and write out the block as usual.
   */
  protected boolean includeMvccVersion;

  /*
   * reusable ByteRanges used for communicating with the sorters/compilers
   */
  protected ByteRange rowRange;
  protected ByteRange familyRange;
  protected ByteRange qualifierRange;

  /*
   * incoming Cell fields are copied into these arrays.  The per-cell arrays are indexed by the
   * order in which cells were added; valueOffsets[i] and valueOffsets[i + 1] delimit the bytes of
   * cell i inside the values buffer.
   */
  protected long[] timestamps;
  protected long[] mvccVersions;
  protected byte[] typeBytes;
  protected int[] valueOffsets;
  protected byte[] values;

  // block-level metadata populated by compile() and written first by flush()
  protected PrefixTreeBlockMeta blockMeta;

  /*
   * Sub-encoders for the simple long/byte fields of a Cell.  Add to these as each cell arrives and
   * compile before flushing.
   */
  protected LongEncoder timestampEncoder;
  protected LongEncoder mvccVersionEncoder;
  protected CellTypeEncoder cellTypeEncoder;

  /*
   * Structures used for collecting families and qualifiers, de-duplicating them, and sorting them
   * so they can be passed to the tokenizers. Unlike row keys where we can detect duplicates by
   * comparing only with the previous row key, families and qualifiers can arrive in unsorted order
   * in blocks spanning multiple rows. We must collect them all into a set to de-duplicate them.
   */
  protected ByteRangeSet familyDeduplicator;
  protected ByteRangeSet qualifierDeduplicator;

  /*
   * Feed sorted byte[]s into these tokenizers which will convert the byte[]s to an in-memory
   * trie structure with nodes connected by memory pointers (not serializable yet).
   */
  protected Tokenizer rowTokenizer;
  protected Tokenizer familyTokenizer;
  protected Tokenizer qualifierTokenizer;

  /*
   * Writers take an in-memory trie, sort the nodes, calculate offsets and lengths, and write
   * all information to an output stream of bytes that can be stored on disk.
   */
  protected RowSectionWriter rowWriter;
  protected ColumnSectionWriter familyWriter;
  protected ColumnSectionWriter qualifierWriter;

  /*
   * Integers used for counting cells and bytes.  We keep track of the size of the Cells as if they
   * were full KeyValues because some parts of HBase like to know the "unencoded size".
   */
  protected int totalCells = 0;
  protected int totalUnencodedBytes = 0;//numBytes if the cells were KeyValues
  protected int totalValueBytes = 0;
  protected int maxValueLength = 0;
  protected int totalBytes = 0;//running sum of the section sizes accumulated by compile()
143 
144 
145   /***************** construct ***********************/
146 
147   public PrefixTreeEncoder(OutputStream outputStream, boolean includeMvccVersion) {
148     // used during cell accumulation
149     this.blockMeta = new PrefixTreeBlockMeta();
150     this.rowRange = new SimpleByteRange();
151     this.familyRange = new SimpleByteRange();
152     this.qualifierRange = new SimpleByteRange();
153     this.timestamps = new long[INITIAL_PER_CELL_ARRAY_SIZES];
154     this.mvccVersions = new long[INITIAL_PER_CELL_ARRAY_SIZES];
155     this.typeBytes = new byte[INITIAL_PER_CELL_ARRAY_SIZES];
156     this.valueOffsets = new int[INITIAL_PER_CELL_ARRAY_SIZES];
157     this.values = new byte[VALUE_BUFFER_INIT_SIZE];
158 
159     // used during compilation
160     this.familyDeduplicator = USE_HASH_COLUMN_SORTER ? new ByteRangeHashSet()
161         : new ByteRangeTreeSet();
162     this.qualifierDeduplicator = USE_HASH_COLUMN_SORTER ? new ByteRangeHashSet()
163         : new ByteRangeTreeSet();
164     this.timestampEncoder = new LongEncoder();
165     this.mvccVersionEncoder = new LongEncoder();
166     this.cellTypeEncoder = new CellTypeEncoder();
167     this.rowTokenizer = new Tokenizer();
168     this.familyTokenizer = new Tokenizer();
169     this.qualifierTokenizer = new Tokenizer();
170     this.rowWriter = new RowSectionWriter();
171     this.familyWriter = new ColumnSectionWriter();
172     this.qualifierWriter = new ColumnSectionWriter();
173 
174     reset(outputStream, includeMvccVersion);
175   }
176 
177   public void reset(OutputStream outputStream, boolean includeMvccVersion) {
178     ++numResets;
179     this.includeMvccVersion = includeMvccVersion;
180     this.outputStream = outputStream;
181     valueOffsets[0] = 0;
182 
183     familyDeduplicator.reset();
184     qualifierDeduplicator.reset();
185     rowTokenizer.reset();
186     timestampEncoder.reset();
187     mvccVersionEncoder.reset();
188     cellTypeEncoder.reset();
189     familyTokenizer.reset();
190     qualifierTokenizer.reset();
191     rowWriter.reset();
192     familyWriter.reset();
193     qualifierWriter.reset();
194 
195     totalCells = 0;
196     totalUnencodedBytes = 0;
197     totalValueBytes = 0;
198     maxValueLength = 0;
199     totalBytes = 0;
200   }
201 
202   /**
203    * Check that the arrays used to hold cell fragments are large enough for the cell that is being
204    * added. Since the PrefixTreeEncoder is cached between uses, these arrays may grow during the
205    * first few block encodings but should stabilize quickly.
206    */
207   protected void ensurePerCellCapacities() {
208     int currentCapacity = valueOffsets.length;
209     int neededCapacity = totalCells + 2;// some things write one index ahead. +2 to be safe
210     if (neededCapacity < currentCapacity) {
211       return;
212     }
213 
214     int padding = neededCapacity;//this will double the array size
215     timestamps = ArrayUtils.growIfNecessary(timestamps, neededCapacity, padding);
216     mvccVersions = ArrayUtils.growIfNecessary(mvccVersions, neededCapacity, padding);
217     typeBytes = ArrayUtils.growIfNecessary(typeBytes, neededCapacity, padding);
218     valueOffsets = ArrayUtils.growIfNecessary(valueOffsets, neededCapacity, padding);
219   }
220 
221   /******************** CellOutputStream methods *************************/
222 
223   /**
224    * Note: Unused until support is added to the scanner/heap
225    * <p/>
   * The following methods are optimized versions of write(Cell cell). The results should be
   * identical; however, the implementation may be able to execute them much more efficiently
   * because it does not need to compare the unchanged fields with the previous cell's.
229    * <p/>
230    * Consider the benefits during compaction when paired with a CellScanner that is also aware of
231    * row boundaries. The CellScanner can easily use these methods instead of blindly passing Cells
232    * to the write(Cell cell) method.
233    * <p/>
234    * The savings of skipping duplicate row detection are significant with long row keys. A
235    * DataBlockEncoder may store a row key once in combination with a count of how many cells are in
236    * the row. With a 100 byte row key, we can replace 100 byte comparisons with a single increment
237    * of the counter, and that is for every cell in the row.
238    */
239 
240   /**
241    * Add a Cell to the output stream but repeat the previous row. 
242    */
243   //@Override
244   public void writeWithRepeatRow(Cell cell) {
245     ensurePerCellCapacities();//can we optimize away some of this?
246 
247     //save a relatively expensive row comparison, incrementing the row's counter instead
248     rowTokenizer.incrementNumOccurrencesOfLatestValue();
249     addFamilyPart(cell);
250     addQualifierPart(cell);
251     addAfterRowFamilyQualifier(cell);
252   }
253 
254 
255   @Override
256   public void write(Cell cell) {
257     ensurePerCellCapacities();
258 
259     rowTokenizer.addSorted(CellUtil.fillRowRange(cell, rowRange));
260     addFamilyPart(cell);
261     addQualifierPart(cell);
262     addAfterRowFamilyQualifier(cell);
263   }
264 
265 
266   /***************** internal add methods ************************/
267 
268   private void addAfterRowFamilyQualifier(Cell cell){
269     // timestamps
270     timestamps[totalCells] = cell.getTimestamp();
271     timestampEncoder.add(cell.getTimestamp());
272 
273     // memstore timestamps
274     if (includeMvccVersion) {
275       mvccVersions[totalCells] = cell.getMvccVersion();
276       mvccVersionEncoder.add(cell.getMvccVersion());
277       totalUnencodedBytes += WritableUtils.getVIntSize(cell.getMvccVersion());
278     }else{
279       //must overwrite in case there was a previous version in this array slot
280       mvccVersions[totalCells] = 0L;
281       if(totalCells == 0){//only need to do this for the first cell added
282         mvccVersionEncoder.add(0L);
283       }
284       //totalUncompressedBytes += 0;//mvccVersion takes zero bytes when disabled
285     }
286 
287     // types
288     typeBytes[totalCells] = cell.getTypeByte();
289     cellTypeEncoder.add(cell.getTypeByte());
290 
291     // values
292     totalValueBytes += cell.getValueLength();
293     // double the array each time we run out of space
294     values = ArrayUtils.growIfNecessary(values, totalValueBytes, 2 * totalValueBytes);
295     CellUtil.copyValueTo(cell, values, valueOffsets[totalCells]);
296     if (cell.getValueLength() > maxValueLength) {
297       maxValueLength = cell.getValueLength();
298     }
299     valueOffsets[totalCells + 1] = totalValueBytes;
300 
301     // general
302     totalUnencodedBytes += KeyValueUtil.length(cell);
303     ++totalCells;
304   }
305 
306   private void addFamilyPart(Cell cell) {
307     if (MULITPLE_FAMILIES_POSSIBLE || totalCells == 0) {
308       CellUtil.fillFamilyRange(cell, familyRange);
309       familyDeduplicator.add(familyRange);
310     }
311   }
312 
313   private void addQualifierPart(Cell cell) {
314     CellUtil.fillQualifierRange(cell, qualifierRange);
315     qualifierDeduplicator.add(qualifierRange);
316   }
317 
318 
319   /****************** compiling/flushing ********************/
320 
321   /**
322    * Expensive method.  The second half of the encoding work happens here.
323    *
324    * Take all the separate accumulated data structures and turn them into a single stream of bytes
325    * which is written to the outputStream.
326    */
327   @Override
328   public void flush() throws IOException {
329     compile();
330 
331     // do the actual flushing to the output stream.  Order matters.
332     blockMeta.writeVariableBytesToOutputStream(outputStream);
333     rowWriter.writeBytes(outputStream);
334     familyWriter.writeBytes(outputStream);
335     qualifierWriter.writeBytes(outputStream);
336     timestampEncoder.writeBytes(outputStream);
337     mvccVersionEncoder.writeBytes(outputStream);
338     //CellType bytes are in the row nodes.  there is no additional type section
339     outputStream.write(values, 0, totalValueBytes);
340   }
341 
342   /**
343    * Now that all the cells have been added, do the work to reduce them to a series of byte[]
344    * fragments that are ready to be written to the output stream.
345    */
346   protected void compile(){
347     blockMeta.setNumKeyValueBytes(totalUnencodedBytes);
348     int lastValueOffset = valueOffsets[totalCells];
349     blockMeta.setValueOffsetWidth(UFIntTool.numBytes(lastValueOffset));
350     blockMeta.setValueLengthWidth(UFIntTool.numBytes(maxValueLength));
351     blockMeta.setNumValueBytes(totalValueBytes);
352     totalBytes += totalValueBytes;
353 
354     //these compile methods will add to totalBytes
355     compileTypes();
356     compileMvccVersions();
357     compileTimestamps();
358     compileQualifiers();
359     compileFamilies();
360     compileRows();
361 
362     int numMetaBytes = blockMeta.calculateNumMetaBytes();
363     blockMeta.setNumMetaBytes(numMetaBytes);
364     totalBytes += numMetaBytes;
365   }
366 
367   /**
368    * The following "compile" methods do any intermediate work necessary to transform the cell
369    * fragments collected during the writing phase into structures that are ready to write to the
370    * outputStream.
371    * <p/>
   * The family and qualifier treatment is almost identical, as are timestamp and mvccVersion.
373    */
374 
375   protected void compileTypes() {
376     blockMeta.setAllSameType(cellTypeEncoder.areAllSameType());
377     if(cellTypeEncoder.areAllSameType()){
378       blockMeta.setAllTypes(cellTypeEncoder.getOnlyType());
379     }
380   }
381 
382   protected void compileMvccVersions() {
383     mvccVersionEncoder.compile();
384     blockMeta.setMvccVersionFields(mvccVersionEncoder);
385     int numMvccVersionBytes = mvccVersionEncoder.getOutputArrayLength();
386     totalBytes += numMvccVersionBytes;
387   }
388 
389   protected void compileTimestamps() {
390     timestampEncoder.compile();
391     blockMeta.setTimestampFields(timestampEncoder);
392     int numTimestampBytes = timestampEncoder.getOutputArrayLength();
393     totalBytes += numTimestampBytes;
394   }
395 
396   protected void compileQualifiers() {
397     blockMeta.setNumUniqueQualifiers(qualifierDeduplicator.size());
398     qualifierDeduplicator.compile();
399     qualifierTokenizer.addAll(qualifierDeduplicator.getSortedRanges());
400     qualifierWriter.reconstruct(blockMeta, qualifierTokenizer, false);
401     qualifierWriter.compile();
402     int numQualifierBytes = qualifierWriter.getNumBytes();
403     blockMeta.setNumQualifierBytes(numQualifierBytes);
404     totalBytes += numQualifierBytes;
405   }
406 
407   protected void compileFamilies() {
408     blockMeta.setNumUniqueFamilies(familyDeduplicator.size());
409     familyDeduplicator.compile();
410     familyTokenizer.addAll(familyDeduplicator.getSortedRanges());
411     familyWriter.reconstruct(blockMeta, familyTokenizer, true);
412     familyWriter.compile();
413     int numFamilyBytes = familyWriter.getNumBytes();
414     blockMeta.setNumFamilyBytes(numFamilyBytes);
415     totalBytes += numFamilyBytes;
416   }
417 
418   protected void compileRows() {
419     rowWriter.reconstruct(this);
420     rowWriter.compile();
421     int numRowBytes = rowWriter.getNumBytes();
422     blockMeta.setNumRowBytes(numRowBytes);
423     blockMeta.setRowTreeDepth(rowTokenizer.getTreeDepth());
424     totalBytes += numRowBytes;
425   }
426 
427   /********************* convenience getters ********************************/
428 
429   public long getValueOffset(int index) {
430     return valueOffsets[index];
431   }
432 
433   public int getValueLength(int index) {
434     return (int) (valueOffsets[index + 1] - valueOffsets[index]);
435   }
436 
437   /************************* get/set *************************************/
438 
439   public PrefixTreeBlockMeta getBlockMeta() {
440     return blockMeta;
441   }
442 
443   public Tokenizer getRowTokenizer() {
444     return rowTokenizer;
445   }
446 
447   public LongEncoder getTimestampEncoder() {
448     return timestampEncoder;
449   }
450 
451   public int getTotalBytes() {
452     return totalBytes;
453   }
454 
455   public long[] getTimestamps() {
456     return timestamps;
457   }
458 
459   public long[] getMvccVersions() {
460     return mvccVersions;
461   }
462 
463   public byte[] getTypeBytes() {
464     return typeBytes;
465   }
466 
467   public LongEncoder getMvccVersionEncoder() {
468     return mvccVersionEncoder;
469   }
470 
471   public ByteRangeSet getFamilySorter() {
472     return familyDeduplicator;
473   }
474 
475   public ByteRangeSet getQualifierSorter() {
476     return qualifierDeduplicator;
477   }
478 
479   public ColumnSectionWriter getFamilyWriter() {
480     return familyWriter;
481   }
482 
483   public ColumnSectionWriter getQualifierWriter() {
484     return qualifierWriter;
485   }
486 
487   public RowSectionWriter getRowWriter() {
488     return rowWriter;
489   }
490 
491   public ByteRange getValueByteRange() {
492     return new SimpleByteRange(values, 0, totalValueBytes);
493   }
494 
495 }