View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.codec.prefixtree.encode;
20  
21  import java.io.IOException;
22  import java.io.OutputStream;
23  
24  import org.apache.commons.logging.Log;
25  import org.apache.commons.logging.LogFactory;
26  import org.apache.hadoop.classification.InterfaceAudience;
27  import org.apache.hadoop.hbase.Cell;
28  import org.apache.hadoop.hbase.CellUtil;
29  import org.apache.hadoop.hbase.KeyValueUtil;
30  import org.apache.hadoop.hbase.codec.prefixtree.PrefixTreeBlockMeta;
31  import org.apache.hadoop.hbase.codec.prefixtree.encode.column.ColumnSectionWriter;
32  import org.apache.hadoop.hbase.codec.prefixtree.encode.other.CellTypeEncoder;
33  import org.apache.hadoop.hbase.codec.prefixtree.encode.other.LongEncoder;
34  import org.apache.hadoop.hbase.codec.prefixtree.encode.row.RowSectionWriter;
35  import org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize.Tokenizer;
36  import org.apache.hadoop.hbase.io.CellOutputStream;
37  import org.apache.hadoop.hbase.util.ArrayUtils;
38  import org.apache.hadoop.hbase.util.ByteRange;
39  import org.apache.hadoop.hbase.util.byterange.ByteRangeSet;
40  import org.apache.hadoop.hbase.util.byterange.impl.ByteRangeHashSet;
41  import org.apache.hadoop.hbase.util.byterange.impl.ByteRangeTreeSet;
42  import org.apache.hadoop.hbase.util.vint.UFIntTool;
43  import org.apache.hadoop.io.WritableUtils;
44  
45  /**
46   * This is the primary class for converting a CellOutputStream into an encoded byte[]. As Cells are
47   * added they are completely copied into the various encoding structures. This is important because
48   * usually the cells being fed in during compactions will be transient.<br/>
49   * <br/>
50   * Usage:<br/>
51   * 1) constructor<br/>
52   * 4) append cells in sorted order: write(Cell cell)<br/>
53   * 5) flush()<br/>
54   */
55  @InterfaceAudience.Private
56  public class PrefixTreeEncoder implements CellOutputStream {
57  
58    /**************** static ************************/
59  
60    protected static final Log LOG = LogFactory.getLog(PrefixTreeEncoder.class);
61  
62    //future-proof where HBase supports multiple families in a data block.
63    public static final boolean MULITPLE_FAMILIES_POSSIBLE = false;
64  
65    private static final boolean USE_HASH_COLUMN_SORTER = true;
66    private static final int INITIAL_PER_CELL_ARRAY_SIZES = 256;
67    private static final int VALUE_BUFFER_INIT_SIZE = 64 * 1024;
68  
69  
70    /**************** fields *************************/
71  
72    protected long numResets = 0L;
73  
74    protected OutputStream outputStream;
75  
76    /*
77     * Cannot change during a single block's encoding. If false, then substitute incoming Cell's
78     * mvccVersion with zero and write out the block as usual.
79     */
80    protected boolean includeMvccVersion;
81  
82    /*
83     * reusable ByteRanges used for communicating with the sorters/compilers
84     */
85    protected ByteRange rowRange;
86    protected ByteRange familyRange;
87    protected ByteRange qualifierRange;
88  
89    /*
90     * incoming Cell fields are copied into these arrays
91     */
92    protected long[] timestamps;
93    protected long[] mvccVersions;
94    protected byte[] typeBytes;
95    protected int[] valueOffsets;
96    protected byte[] values;
97  
98    protected PrefixTreeBlockMeta blockMeta;
99  
100   /*
101    * Sub-encoders for the simple long/byte fields of a Cell.  Add to these as each cell arrives and
102    * compile before flushing.
103    */
104   protected LongEncoder timestampEncoder;
105   protected LongEncoder mvccVersionEncoder;
106   protected CellTypeEncoder cellTypeEncoder;
107 
108   /*
109    * Structures used for collecting families and qualifiers, de-duplicating them, and sorting them
110    * so they can be passed to the tokenizers. Unlike row keys where we can detect duplicates by
111    * comparing only with the previous row key, families and qualifiers can arrive in unsorted order
112    * in blocks spanning multiple rows. We must collect them all into a set to de-duplicate them.
113    */
114   protected ByteRangeSet familyDeduplicator;
115   protected ByteRangeSet qualifierDeduplicator;
116 
117   /*
118    * Feed sorted byte[]s into these tokenizers which will convert the byte[]s to an in-memory
119    * trie structure with nodes connected by memory pointers (not serializable yet).
120    */
121   protected Tokenizer rowTokenizer;
122   protected Tokenizer familyTokenizer;
123   protected Tokenizer qualifierTokenizer;
124 
125   /*
126    * Writers take an in-memory trie, sort the nodes, calculate offsets and lengths, and write
127    * all information to an output stream of bytes that can be stored on disk.
128    */
129   protected RowSectionWriter rowWriter;
130   protected ColumnSectionWriter familyWriter;
131   protected ColumnSectionWriter qualifierWriter;
132 
133   /*
134    * Integers used for counting cells and bytes.  We keep track of the size of the Cells as if they
135    * were full KeyValues because some parts of HBase like to know the "unencoded size".
136    */
137   protected int totalCells = 0;
138   protected int totalUnencodedBytes = 0;//numBytes if the cells were KeyValues
139   protected int totalValueBytes = 0;
140   protected int maxValueLength = 0;
141   protected int totalBytes = 0;//
142 
143 
144   /***************** construct ***********************/
145 
146   public PrefixTreeEncoder(OutputStream outputStream, boolean includeMvccVersion) {
147     // used during cell accumulation
148     this.blockMeta = new PrefixTreeBlockMeta();
149     this.rowRange = new ByteRange();
150     this.familyRange = new ByteRange();
151     this.qualifierRange = new ByteRange();
152     this.timestamps = new long[INITIAL_PER_CELL_ARRAY_SIZES];
153     this.mvccVersions = new long[INITIAL_PER_CELL_ARRAY_SIZES];
154     this.typeBytes = new byte[INITIAL_PER_CELL_ARRAY_SIZES];
155     this.valueOffsets = new int[INITIAL_PER_CELL_ARRAY_SIZES];
156     this.values = new byte[VALUE_BUFFER_INIT_SIZE];
157 
158     // used during compilation
159     this.familyDeduplicator = USE_HASH_COLUMN_SORTER ? new ByteRangeHashSet()
160         : new ByteRangeTreeSet();
161     this.qualifierDeduplicator = USE_HASH_COLUMN_SORTER ? new ByteRangeHashSet()
162         : new ByteRangeTreeSet();
163     this.timestampEncoder = new LongEncoder();
164     this.mvccVersionEncoder = new LongEncoder();
165     this.cellTypeEncoder = new CellTypeEncoder();
166     this.rowTokenizer = new Tokenizer();
167     this.familyTokenizer = new Tokenizer();
168     this.qualifierTokenizer = new Tokenizer();
169     this.rowWriter = new RowSectionWriter();
170     this.familyWriter = new ColumnSectionWriter();
171     this.qualifierWriter = new ColumnSectionWriter();
172 
173     reset(outputStream, includeMvccVersion);
174   }
175 
176   public void reset(OutputStream outputStream, boolean includeMvccVersion) {
177     ++numResets;
178     this.includeMvccVersion = includeMvccVersion;
179     this.outputStream = outputStream;
180     valueOffsets[0] = 0;
181 
182     familyDeduplicator.reset();
183     qualifierDeduplicator.reset();
184     rowTokenizer.reset();
185     timestampEncoder.reset();
186     mvccVersionEncoder.reset();
187     cellTypeEncoder.reset();
188     familyTokenizer.reset();
189     qualifierTokenizer.reset();
190     rowWriter.reset();
191     familyWriter.reset();
192     qualifierWriter.reset();
193 
194     totalCells = 0;
195     totalUnencodedBytes = 0;
196     totalValueBytes = 0;
197     maxValueLength = 0;
198     totalBytes = 0;
199   }
200 
201   /**
202    * Check that the arrays used to hold cell fragments are large enough for the cell that is being
203    * added. Since the PrefixTreeEncoder is cached between uses, these arrays may grow during the
204    * first few block encodings but should stabilize quickly.
205    */
206   protected void ensurePerCellCapacities() {
207     int currentCapacity = valueOffsets.length;
208     int neededCapacity = totalCells + 2;// some things write one index ahead. +2 to be safe
209     if (neededCapacity < currentCapacity) {
210       return;
211     }
212 
213     int padding = neededCapacity;//this will double the array size
214     timestamps = ArrayUtils.growIfNecessary(timestamps, neededCapacity, padding);
215     mvccVersions = ArrayUtils.growIfNecessary(mvccVersions, neededCapacity, padding);
216     typeBytes = ArrayUtils.growIfNecessary(typeBytes, neededCapacity, padding);
217     valueOffsets = ArrayUtils.growIfNecessary(valueOffsets, neededCapacity, padding);
218   }
219 
220   /******************** CellOutputStream methods *************************/
221 
222   /**
223    * Note: Unused until support is added to the scanner/heap
224    * <p/>
225    * The following method are optimized versions of write(Cell cell). The result should be
226    * identical, however the implementation may be able to execute them much more efficiently because
227    * it does not need to compare the unchanged fields with the previous cell's.
228    * <p/>
229    * Consider the benefits during compaction when paired with a CellScanner that is also aware of
230    * row boundaries. The CellScanner can easily use these methods instead of blindly passing Cells
231    * to the write(Cell cell) method.
232    * <p/>
233    * The savings of skipping duplicate row detection are significant with long row keys. A
234    * DataBlockEncoder may store a row key once in combination with a count of how many cells are in
235    * the row. With a 100 byte row key, we can replace 100 byte comparisons with a single increment
236    * of the counter, and that is for every cell in the row.
237    */
238 
239   /**
240    * Add a Cell to the output stream but repeat the previous row. 
241    */
242   //@Override
243   public void writeWithRepeatRow(Cell cell) {
244     ensurePerCellCapacities();//can we optimize away some of this?
245 
246     //save a relatively expensive row comparison, incrementing the row's counter instead
247     rowTokenizer.incrementNumOccurrencesOfLatestValue();
248     addFamilyPart(cell);
249     addQualifierPart(cell);
250     addAfterRowFamilyQualifier(cell);
251   }
252 
253 
254   @Override
255   public void write(Cell cell) {
256     ensurePerCellCapacities();
257 
258     rowTokenizer.addSorted(CellUtil.fillRowRange(cell, rowRange));
259     addFamilyPart(cell);
260     addQualifierPart(cell);
261     addAfterRowFamilyQualifier(cell);
262   }
263 
264 
265   /***************** internal add methods ************************/
266 
267   private void addAfterRowFamilyQualifier(Cell cell){
268     // timestamps
269     timestamps[totalCells] = cell.getTimestamp();
270     timestampEncoder.add(cell.getTimestamp());
271 
272     // memstore timestamps
273     if (includeMvccVersion) {
274       mvccVersions[totalCells] = cell.getMvccVersion();
275       mvccVersionEncoder.add(cell.getMvccVersion());
276       totalUnencodedBytes += WritableUtils.getVIntSize(cell.getMvccVersion());
277     }else{
278       //must overwrite in case there was a previous version in this array slot
279       mvccVersions[totalCells] = 0L;
280       if(totalCells == 0){//only need to do this for the first cell added
281         mvccVersionEncoder.add(0L);
282       }
283       //totalUncompressedBytes += 0;//mvccVersion takes zero bytes when disabled
284     }
285 
286     // types
287     typeBytes[totalCells] = cell.getTypeByte();
288     cellTypeEncoder.add(cell.getTypeByte());
289 
290     // values
291     totalValueBytes += cell.getValueLength();
292     // double the array each time we run out of space
293     values = ArrayUtils.growIfNecessary(values, totalValueBytes, 2 * totalValueBytes);
294     CellUtil.copyValueTo(cell, values, valueOffsets[totalCells]);
295     if (cell.getValueLength() > maxValueLength) {
296       maxValueLength = cell.getValueLength();
297     }
298     valueOffsets[totalCells + 1] = totalValueBytes;
299 
300     // general
301     totalUnencodedBytes += KeyValueUtil.length(cell);
302     ++totalCells;
303   }
304 
305   private void addFamilyPart(Cell cell) {
306     if (MULITPLE_FAMILIES_POSSIBLE || totalCells == 0) {
307       CellUtil.fillFamilyRange(cell, familyRange);
308       familyDeduplicator.add(familyRange);
309     }
310   }
311 
312   private void addQualifierPart(Cell cell) {
313     CellUtil.fillQualifierRange(cell, qualifierRange);
314     qualifierDeduplicator.add(qualifierRange);
315   }
316 
317 
318   /****************** compiling/flushing ********************/
319 
320   /**
321    * Expensive method.  The second half of the encoding work happens here.
322    *
323    * Take all the separate accumulated data structures and turn them into a single stream of bytes
324    * which is written to the outputStream.
325    */
326   @Override
327   public void flush() throws IOException {
328     compile();
329 
330     // do the actual flushing to the output stream.  Order matters.
331     blockMeta.writeVariableBytesToOutputStream(outputStream);
332     rowWriter.writeBytes(outputStream);
333     familyWriter.writeBytes(outputStream);
334     qualifierWriter.writeBytes(outputStream);
335     timestampEncoder.writeBytes(outputStream);
336     mvccVersionEncoder.writeBytes(outputStream);
337     //CellType bytes are in the row nodes.  there is no additional type section
338     outputStream.write(values, 0, totalValueBytes);
339   }
340 
341   /**
342    * Now that all the cells have been added, do the work to reduce them to a series of byte[]
343    * fragments that are ready to be written to the output stream.
344    */
345   protected void compile(){
346     blockMeta.setNumKeyValueBytes(totalUnencodedBytes);
347     int lastValueOffset = valueOffsets[totalCells];
348     blockMeta.setValueOffsetWidth(UFIntTool.numBytes(lastValueOffset));
349     blockMeta.setValueLengthWidth(UFIntTool.numBytes(maxValueLength));
350     blockMeta.setNumValueBytes(totalValueBytes);
351     totalBytes += totalValueBytes;
352 
353     //these compile methods will add to totalBytes
354     compileTypes();
355     compileMvccVersions();
356     compileTimestamps();
357     compileQualifiers();
358     compileFamilies();
359     compileRows();
360 
361     int numMetaBytes = blockMeta.calculateNumMetaBytes();
362     blockMeta.setNumMetaBytes(numMetaBytes);
363     totalBytes += numMetaBytes;
364   }
365 
366   /**
367    * The following "compile" methods do any intermediate work necessary to transform the cell
368    * fragments collected during the writing phase into structures that are ready to write to the
369    * outputStream.
370    * <p/>
371    * The family and qualifier treatment is almost identical, as is timestamp and mvccVersion.
372    */
373 
374   protected void compileTypes() {
375     blockMeta.setAllSameType(cellTypeEncoder.areAllSameType());
376     if(cellTypeEncoder.areAllSameType()){
377       blockMeta.setAllTypes(cellTypeEncoder.getOnlyType());
378     }
379   }
380 
381   protected void compileMvccVersions() {
382     mvccVersionEncoder.compile();
383     blockMeta.setMvccVersionFields(mvccVersionEncoder);
384     int numMvccVersionBytes = mvccVersionEncoder.getOutputArrayLength();
385     totalBytes += numMvccVersionBytes;
386   }
387 
388   protected void compileTimestamps() {
389     timestampEncoder.compile();
390     blockMeta.setTimestampFields(timestampEncoder);
391     int numTimestampBytes = timestampEncoder.getOutputArrayLength();
392     totalBytes += numTimestampBytes;
393   }
394 
395   protected void compileQualifiers() {
396     blockMeta.setNumUniqueQualifiers(qualifierDeduplicator.size());
397     qualifierDeduplicator.compile();
398     qualifierTokenizer.addAll(qualifierDeduplicator.getSortedRanges());
399     qualifierWriter.reconstruct(blockMeta, qualifierTokenizer, false);
400     qualifierWriter.compile();
401     int numQualifierBytes = qualifierWriter.getNumBytes();
402     blockMeta.setNumQualifierBytes(numQualifierBytes);
403     totalBytes += numQualifierBytes;
404   }
405 
406   protected void compileFamilies() {
407     blockMeta.setNumUniqueFamilies(familyDeduplicator.size());
408     familyDeduplicator.compile();
409     familyTokenizer.addAll(familyDeduplicator.getSortedRanges());
410     familyWriter.reconstruct(blockMeta, familyTokenizer, true);
411     familyWriter.compile();
412     int numFamilyBytes = familyWriter.getNumBytes();
413     blockMeta.setNumFamilyBytes(numFamilyBytes);
414     totalBytes += numFamilyBytes;
415   }
416 
417   protected void compileRows() {
418     rowWriter.reconstruct(this);
419     rowWriter.compile();
420     int numRowBytes = rowWriter.getNumBytes();
421     blockMeta.setNumRowBytes(numRowBytes);
422     blockMeta.setRowTreeDepth(rowTokenizer.getTreeDepth());
423     totalBytes += numRowBytes;
424   }
425 
426   /********************* convenience getters ********************************/
427 
428   public long getValueOffset(int index) {
429     return valueOffsets[index];
430   }
431 
432   public int getValueLength(int index) {
433     return (int) (valueOffsets[index + 1] - valueOffsets[index]);
434   }
435 
436   /************************* get/set *************************************/
437 
438   public PrefixTreeBlockMeta getBlockMeta() {
439     return blockMeta;
440   }
441 
442   public Tokenizer getRowTokenizer() {
443     return rowTokenizer;
444   }
445 
446   public LongEncoder getTimestampEncoder() {
447     return timestampEncoder;
448   }
449 
450   public int getTotalBytes() {
451     return totalBytes;
452   }
453 
454   public long[] getTimestamps() {
455     return timestamps;
456   }
457 
458   public long[] getMvccVersions() {
459     return mvccVersions;
460   }
461 
462   public byte[] getTypeBytes() {
463     return typeBytes;
464   }
465 
466   public LongEncoder getMvccVersionEncoder() {
467     return mvccVersionEncoder;
468   }
469 
470   public ByteRangeSet getFamilySorter() {
471     return familyDeduplicator;
472   }
473 
474   public ByteRangeSet getQualifierSorter() {
475     return qualifierDeduplicator;
476   }
477 
478   public ColumnSectionWriter getFamilyWriter() {
479     return familyWriter;
480   }
481 
482   public ColumnSectionWriter getQualifierWriter() {
483     return qualifierWriter;
484   }
485 
486   public RowSectionWriter getRowWriter() {
487     return rowWriter;
488   }
489 
490   public ByteRange getValueByteRange() {
491     return new ByteRange(values, 0, totalValueBytes);
492   }
493 
494 }