1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.codec.prefixtree.encode;
20
21 import java.io.IOException;
22 import java.io.OutputStream;
23
24 import org.apache.commons.logging.Log;
25 import org.apache.commons.logging.LogFactory;
26 import org.apache.hadoop.classification.InterfaceAudience;
27 import org.apache.hadoop.hbase.Cell;
28 import org.apache.hadoop.hbase.CellUtil;
29 import org.apache.hadoop.hbase.KeyValueUtil;
30 import org.apache.hadoop.hbase.codec.prefixtree.PrefixTreeBlockMeta;
31 import org.apache.hadoop.hbase.codec.prefixtree.encode.column.ColumnSectionWriter;
32 import org.apache.hadoop.hbase.codec.prefixtree.encode.other.CellTypeEncoder;
33 import org.apache.hadoop.hbase.codec.prefixtree.encode.other.LongEncoder;
34 import org.apache.hadoop.hbase.codec.prefixtree.encode.row.RowSectionWriter;
35 import org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize.Tokenizer;
36 import org.apache.hadoop.hbase.io.CellOutputStream;
37 import org.apache.hadoop.hbase.util.ArrayUtils;
38 import org.apache.hadoop.hbase.util.ByteRange;
39 import org.apache.hadoop.hbase.util.byterange.ByteRangeSet;
40 import org.apache.hadoop.hbase.util.byterange.impl.ByteRangeHashSet;
41 import org.apache.hadoop.hbase.util.byterange.impl.ByteRangeTreeSet;
42 import org.apache.hadoop.hbase.util.vint.UFIntTool;
43 import org.apache.hadoop.io.WritableUtils;
44
45
46
47
48
49
50
51
52
53
54
55 @InterfaceAudience.Private
56 public class PrefixTreeEncoder implements CellOutputStream {
57
58
59
60 protected static final Log LOG = LogFactory.getLog(PrefixTreeEncoder.class);
61
62
63 public static final boolean MULITPLE_FAMILIES_POSSIBLE = false;
64
65 private static final boolean USE_HASH_COLUMN_SORTER = true;
66 private static final int INITIAL_PER_CELL_ARRAY_SIZES = 256;
67 private static final int VALUE_BUFFER_INIT_SIZE = 64 * 1024;
68
69
70
71
72 protected long numResets = 0L;
73
74 protected OutputStream outputStream;
75
76
77
78
79
80 protected boolean includeMvccVersion;
81
82
83
84
85 protected ByteRange rowRange;
86 protected ByteRange familyRange;
87 protected ByteRange qualifierRange;
88
89
90
91
92 protected long[] timestamps;
93 protected long[] mvccVersions;
94 protected byte[] typeBytes;
95 protected int[] valueOffsets;
96 protected byte[] values;
97
98 protected PrefixTreeBlockMeta blockMeta;
99
100
101
102
103
104 protected LongEncoder timestampEncoder;
105 protected LongEncoder mvccVersionEncoder;
106 protected CellTypeEncoder cellTypeEncoder;
107
108
109
110
111
112
113
114 protected ByteRangeSet familyDeduplicator;
115 protected ByteRangeSet qualifierDeduplicator;
116
117
118
119
120
121 protected Tokenizer rowTokenizer;
122 protected Tokenizer familyTokenizer;
123 protected Tokenizer qualifierTokenizer;
124
125
126
127
128
129 protected RowSectionWriter rowWriter;
130 protected ColumnSectionWriter familyWriter;
131 protected ColumnSectionWriter qualifierWriter;
132
133
134
135
136
137 protected int totalCells = 0;
138 protected int totalUnencodedBytes = 0;
139 protected int totalValueBytes = 0;
140 protected int maxValueLength = 0;
141 protected int totalBytes = 0;
142
143
144
145
146 public PrefixTreeEncoder(OutputStream outputStream, boolean includeMvccVersion) {
147
148 this.blockMeta = new PrefixTreeBlockMeta();
149 this.rowRange = new ByteRange();
150 this.familyRange = new ByteRange();
151 this.qualifierRange = new ByteRange();
152 this.timestamps = new long[INITIAL_PER_CELL_ARRAY_SIZES];
153 this.mvccVersions = new long[INITIAL_PER_CELL_ARRAY_SIZES];
154 this.typeBytes = new byte[INITIAL_PER_CELL_ARRAY_SIZES];
155 this.valueOffsets = new int[INITIAL_PER_CELL_ARRAY_SIZES];
156 this.values = new byte[VALUE_BUFFER_INIT_SIZE];
157
158
159 this.familyDeduplicator = USE_HASH_COLUMN_SORTER ? new ByteRangeHashSet()
160 : new ByteRangeTreeSet();
161 this.qualifierDeduplicator = USE_HASH_COLUMN_SORTER ? new ByteRangeHashSet()
162 : new ByteRangeTreeSet();
163 this.timestampEncoder = new LongEncoder();
164 this.mvccVersionEncoder = new LongEncoder();
165 this.cellTypeEncoder = new CellTypeEncoder();
166 this.rowTokenizer = new Tokenizer();
167 this.familyTokenizer = new Tokenizer();
168 this.qualifierTokenizer = new Tokenizer();
169 this.rowWriter = new RowSectionWriter();
170 this.familyWriter = new ColumnSectionWriter();
171 this.qualifierWriter = new ColumnSectionWriter();
172
173 reset(outputStream, includeMvccVersion);
174 }
175
176 public void reset(OutputStream outputStream, boolean includeMvccVersion) {
177 ++numResets;
178 this.includeMvccVersion = includeMvccVersion;
179 this.outputStream = outputStream;
180 valueOffsets[0] = 0;
181
182 familyDeduplicator.reset();
183 qualifierDeduplicator.reset();
184 rowTokenizer.reset();
185 timestampEncoder.reset();
186 mvccVersionEncoder.reset();
187 cellTypeEncoder.reset();
188 familyTokenizer.reset();
189 qualifierTokenizer.reset();
190 rowWriter.reset();
191 familyWriter.reset();
192 qualifierWriter.reset();
193
194 totalCells = 0;
195 totalUnencodedBytes = 0;
196 totalValueBytes = 0;
197 maxValueLength = 0;
198 totalBytes = 0;
199 }
200
201
202
203
204
205
206 protected void ensurePerCellCapacities() {
207 int currentCapacity = valueOffsets.length;
208 int neededCapacity = totalCells + 2;
209 if (neededCapacity < currentCapacity) {
210 return;
211 }
212
213 int padding = neededCapacity;
214 timestamps = ArrayUtils.growIfNecessary(timestamps, neededCapacity, padding);
215 mvccVersions = ArrayUtils.growIfNecessary(mvccVersions, neededCapacity, padding);
216 typeBytes = ArrayUtils.growIfNecessary(typeBytes, neededCapacity, padding);
217 valueOffsets = ArrayUtils.growIfNecessary(valueOffsets, neededCapacity, padding);
218 }
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243 public void writeWithRepeatRow(Cell cell) {
244 ensurePerCellCapacities();
245
246
247 rowTokenizer.incrementNumOccurrencesOfLatestValue();
248 addFamilyPart(cell);
249 addQualifierPart(cell);
250 addAfterRowFamilyQualifier(cell);
251 }
252
253
254 @Override
255 public void write(Cell cell) {
256 ensurePerCellCapacities();
257
258 rowTokenizer.addSorted(CellUtil.fillRowRange(cell, rowRange));
259 addFamilyPart(cell);
260 addQualifierPart(cell);
261 addAfterRowFamilyQualifier(cell);
262 }
263
264
265
266
267 private void addAfterRowFamilyQualifier(Cell cell){
268
269 timestamps[totalCells] = cell.getTimestamp();
270 timestampEncoder.add(cell.getTimestamp());
271
272
273 if (includeMvccVersion) {
274 mvccVersions[totalCells] = cell.getMvccVersion();
275 mvccVersionEncoder.add(cell.getMvccVersion());
276 totalUnencodedBytes += WritableUtils.getVIntSize(cell.getMvccVersion());
277 }else{
278
279 mvccVersions[totalCells] = 0L;
280 if(totalCells == 0){
281 mvccVersionEncoder.add(0L);
282 }
283
284 }
285
286
287 typeBytes[totalCells] = cell.getTypeByte();
288 cellTypeEncoder.add(cell.getTypeByte());
289
290
291 totalValueBytes += cell.getValueLength();
292
293 values = ArrayUtils.growIfNecessary(values, totalValueBytes, 2 * totalValueBytes);
294 CellUtil.copyValueTo(cell, values, valueOffsets[totalCells]);
295 if (cell.getValueLength() > maxValueLength) {
296 maxValueLength = cell.getValueLength();
297 }
298 valueOffsets[totalCells + 1] = totalValueBytes;
299
300
301 totalUnencodedBytes += KeyValueUtil.length(cell);
302 ++totalCells;
303 }
304
305 private void addFamilyPart(Cell cell) {
306 if (MULITPLE_FAMILIES_POSSIBLE || totalCells == 0) {
307 CellUtil.fillFamilyRange(cell, familyRange);
308 familyDeduplicator.add(familyRange);
309 }
310 }
311
312 private void addQualifierPart(Cell cell) {
313 CellUtil.fillQualifierRange(cell, qualifierRange);
314 qualifierDeduplicator.add(qualifierRange);
315 }
316
317
318
319
320
321
322
323
324
325
326 @Override
327 public void flush() throws IOException {
328 compile();
329
330
331 blockMeta.writeVariableBytesToOutputStream(outputStream);
332 rowWriter.writeBytes(outputStream);
333 familyWriter.writeBytes(outputStream);
334 qualifierWriter.writeBytes(outputStream);
335 timestampEncoder.writeBytes(outputStream);
336 mvccVersionEncoder.writeBytes(outputStream);
337
338 outputStream.write(values, 0, totalValueBytes);
339 }
340
341
342
343
344
345 protected void compile(){
346 blockMeta.setNumKeyValueBytes(totalUnencodedBytes);
347 int lastValueOffset = valueOffsets[totalCells];
348 blockMeta.setValueOffsetWidth(UFIntTool.numBytes(lastValueOffset));
349 blockMeta.setValueLengthWidth(UFIntTool.numBytes(maxValueLength));
350 blockMeta.setNumValueBytes(totalValueBytes);
351 totalBytes += totalValueBytes;
352
353
354 compileTypes();
355 compileMvccVersions();
356 compileTimestamps();
357 compileQualifiers();
358 compileFamilies();
359 compileRows();
360
361 int numMetaBytes = blockMeta.calculateNumMetaBytes();
362 blockMeta.setNumMetaBytes(numMetaBytes);
363 totalBytes += numMetaBytes;
364 }
365
366
367
368
369
370
371
372
373
374 protected void compileTypes() {
375 blockMeta.setAllSameType(cellTypeEncoder.areAllSameType());
376 if(cellTypeEncoder.areAllSameType()){
377 blockMeta.setAllTypes(cellTypeEncoder.getOnlyType());
378 }
379 }
380
381 protected void compileMvccVersions() {
382 mvccVersionEncoder.compile();
383 blockMeta.setMvccVersionFields(mvccVersionEncoder);
384 int numMvccVersionBytes = mvccVersionEncoder.getOutputArrayLength();
385 totalBytes += numMvccVersionBytes;
386 }
387
388 protected void compileTimestamps() {
389 timestampEncoder.compile();
390 blockMeta.setTimestampFields(timestampEncoder);
391 int numTimestampBytes = timestampEncoder.getOutputArrayLength();
392 totalBytes += numTimestampBytes;
393 }
394
395 protected void compileQualifiers() {
396 blockMeta.setNumUniqueQualifiers(qualifierDeduplicator.size());
397 qualifierDeduplicator.compile();
398 qualifierTokenizer.addAll(qualifierDeduplicator.getSortedRanges());
399 qualifierWriter.reconstruct(blockMeta, qualifierTokenizer, false);
400 qualifierWriter.compile();
401 int numQualifierBytes = qualifierWriter.getNumBytes();
402 blockMeta.setNumQualifierBytes(numQualifierBytes);
403 totalBytes += numQualifierBytes;
404 }
405
406 protected void compileFamilies() {
407 blockMeta.setNumUniqueFamilies(familyDeduplicator.size());
408 familyDeduplicator.compile();
409 familyTokenizer.addAll(familyDeduplicator.getSortedRanges());
410 familyWriter.reconstruct(blockMeta, familyTokenizer, true);
411 familyWriter.compile();
412 int numFamilyBytes = familyWriter.getNumBytes();
413 blockMeta.setNumFamilyBytes(numFamilyBytes);
414 totalBytes += numFamilyBytes;
415 }
416
417 protected void compileRows() {
418 rowWriter.reconstruct(this);
419 rowWriter.compile();
420 int numRowBytes = rowWriter.getNumBytes();
421 blockMeta.setNumRowBytes(numRowBytes);
422 blockMeta.setRowTreeDepth(rowTokenizer.getTreeDepth());
423 totalBytes += numRowBytes;
424 }
425
426
427
428 public long getValueOffset(int index) {
429 return valueOffsets[index];
430 }
431
432 public int getValueLength(int index) {
433 return (int) (valueOffsets[index + 1] - valueOffsets[index]);
434 }
435
436
437
438 public PrefixTreeBlockMeta getBlockMeta() {
439 return blockMeta;
440 }
441
442 public Tokenizer getRowTokenizer() {
443 return rowTokenizer;
444 }
445
446 public LongEncoder getTimestampEncoder() {
447 return timestampEncoder;
448 }
449
450 public int getTotalBytes() {
451 return totalBytes;
452 }
453
454 public long[] getTimestamps() {
455 return timestamps;
456 }
457
458 public long[] getMvccVersions() {
459 return mvccVersions;
460 }
461
462 public byte[] getTypeBytes() {
463 return typeBytes;
464 }
465
466 public LongEncoder getMvccVersionEncoder() {
467 return mvccVersionEncoder;
468 }
469
470 public ByteRangeSet getFamilySorter() {
471 return familyDeduplicator;
472 }
473
474 public ByteRangeSet getQualifierSorter() {
475 return qualifierDeduplicator;
476 }
477
478 public ColumnSectionWriter getFamilyWriter() {
479 return familyWriter;
480 }
481
482 public ColumnSectionWriter getQualifierWriter() {
483 return qualifierWriter;
484 }
485
486 public RowSectionWriter getRowWriter() {
487 return rowWriter;
488 }
489
490 public ByteRange getValueByteRange() {
491 return new ByteRange(values, 0, totalValueBytes);
492 }
493
494 }