1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.codec.prefixtree.encode;
20
21 import java.io.IOException;
22 import java.io.OutputStream;
23
24 import org.apache.commons.logging.Log;
25 import org.apache.commons.logging.LogFactory;
26 import org.apache.hadoop.classification.InterfaceAudience;
27 import org.apache.hadoop.hbase.Cell;
28 import org.apache.hadoop.hbase.CellUtil;
29 import org.apache.hadoop.hbase.KeyValueUtil;
30 import org.apache.hadoop.hbase.codec.prefixtree.PrefixTreeBlockMeta;
31 import org.apache.hadoop.hbase.codec.prefixtree.encode.column.ColumnSectionWriter;
32 import org.apache.hadoop.hbase.codec.prefixtree.encode.other.CellTypeEncoder;
33 import org.apache.hadoop.hbase.codec.prefixtree.encode.other.LongEncoder;
34 import org.apache.hadoop.hbase.codec.prefixtree.encode.row.RowSectionWriter;
35 import org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize.Tokenizer;
36 import org.apache.hadoop.hbase.io.CellOutputStream;
37 import org.apache.hadoop.hbase.util.ArrayUtils;
38 import org.apache.hadoop.hbase.util.ByteRange;
39 import org.apache.hadoop.hbase.util.SimpleByteRange;
40 import org.apache.hadoop.hbase.util.byterange.ByteRangeSet;
41 import org.apache.hadoop.hbase.util.byterange.impl.ByteRangeHashSet;
42 import org.apache.hadoop.hbase.util.byterange.impl.ByteRangeTreeSet;
43 import org.apache.hadoop.hbase.util.vint.UFIntTool;
44 import org.apache.hadoop.io.WritableUtils;
45
46
47
48
49
50
51
52
53
54
55
56 @InterfaceAudience.Private
57 public class PrefixTreeEncoder implements CellOutputStream {
58
59
60
61 protected static final Log LOG = LogFactory.getLog(PrefixTreeEncoder.class);
62
63
64 public static final boolean MULITPLE_FAMILIES_POSSIBLE = false;
65
66 private static final boolean USE_HASH_COLUMN_SORTER = true;
67 private static final int INITIAL_PER_CELL_ARRAY_SIZES = 256;
68 private static final int VALUE_BUFFER_INIT_SIZE = 64 * 1024;
69
70
71
72
73 protected long numResets = 0L;
74
75 protected OutputStream outputStream;
76
77
78
79
80
81 protected boolean includeMvccVersion;
82
83
84
85
86 protected ByteRange rowRange;
87 protected ByteRange familyRange;
88 protected ByteRange qualifierRange;
89
90
91
92
93 protected long[] timestamps;
94 protected long[] mvccVersions;
95 protected byte[] typeBytes;
96 protected int[] valueOffsets;
97 protected byte[] values;
98
99 protected PrefixTreeBlockMeta blockMeta;
100
101
102
103
104
105 protected LongEncoder timestampEncoder;
106 protected LongEncoder mvccVersionEncoder;
107 protected CellTypeEncoder cellTypeEncoder;
108
109
110
111
112
113
114
115 protected ByteRangeSet familyDeduplicator;
116 protected ByteRangeSet qualifierDeduplicator;
117
118
119
120
121
122 protected Tokenizer rowTokenizer;
123 protected Tokenizer familyTokenizer;
124 protected Tokenizer qualifierTokenizer;
125
126
127
128
129
130 protected RowSectionWriter rowWriter;
131 protected ColumnSectionWriter familyWriter;
132 protected ColumnSectionWriter qualifierWriter;
133
134
135
136
137
138 protected int totalCells = 0;
139 protected int totalUnencodedBytes = 0;
140 protected int totalValueBytes = 0;
141 protected int maxValueLength = 0;
142 protected int totalBytes = 0;
143
144
145
146
147 public PrefixTreeEncoder(OutputStream outputStream, boolean includeMvccVersion) {
148
149 this.blockMeta = new PrefixTreeBlockMeta();
150 this.rowRange = new SimpleByteRange();
151 this.familyRange = new SimpleByteRange();
152 this.qualifierRange = new SimpleByteRange();
153 this.timestamps = new long[INITIAL_PER_CELL_ARRAY_SIZES];
154 this.mvccVersions = new long[INITIAL_PER_CELL_ARRAY_SIZES];
155 this.typeBytes = new byte[INITIAL_PER_CELL_ARRAY_SIZES];
156 this.valueOffsets = new int[INITIAL_PER_CELL_ARRAY_SIZES];
157 this.values = new byte[VALUE_BUFFER_INIT_SIZE];
158
159
160 this.familyDeduplicator = USE_HASH_COLUMN_SORTER ? new ByteRangeHashSet()
161 : new ByteRangeTreeSet();
162 this.qualifierDeduplicator = USE_HASH_COLUMN_SORTER ? new ByteRangeHashSet()
163 : new ByteRangeTreeSet();
164 this.timestampEncoder = new LongEncoder();
165 this.mvccVersionEncoder = new LongEncoder();
166 this.cellTypeEncoder = new CellTypeEncoder();
167 this.rowTokenizer = new Tokenizer();
168 this.familyTokenizer = new Tokenizer();
169 this.qualifierTokenizer = new Tokenizer();
170 this.rowWriter = new RowSectionWriter();
171 this.familyWriter = new ColumnSectionWriter();
172 this.qualifierWriter = new ColumnSectionWriter();
173
174 reset(outputStream, includeMvccVersion);
175 }
176
177 public void reset(OutputStream outputStream, boolean includeMvccVersion) {
178 ++numResets;
179 this.includeMvccVersion = includeMvccVersion;
180 this.outputStream = outputStream;
181 valueOffsets[0] = 0;
182
183 familyDeduplicator.reset();
184 qualifierDeduplicator.reset();
185 rowTokenizer.reset();
186 timestampEncoder.reset();
187 mvccVersionEncoder.reset();
188 cellTypeEncoder.reset();
189 familyTokenizer.reset();
190 qualifierTokenizer.reset();
191 rowWriter.reset();
192 familyWriter.reset();
193 qualifierWriter.reset();
194
195 totalCells = 0;
196 totalUnencodedBytes = 0;
197 totalValueBytes = 0;
198 maxValueLength = 0;
199 totalBytes = 0;
200 }
201
202
203
204
205
206
207 protected void ensurePerCellCapacities() {
208 int currentCapacity = valueOffsets.length;
209 int neededCapacity = totalCells + 2;
210 if (neededCapacity < currentCapacity) {
211 return;
212 }
213
214 int padding = neededCapacity;
215 timestamps = ArrayUtils.growIfNecessary(timestamps, neededCapacity, padding);
216 mvccVersions = ArrayUtils.growIfNecessary(mvccVersions, neededCapacity, padding);
217 typeBytes = ArrayUtils.growIfNecessary(typeBytes, neededCapacity, padding);
218 valueOffsets = ArrayUtils.growIfNecessary(valueOffsets, neededCapacity, padding);
219 }
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244 public void writeWithRepeatRow(Cell cell) {
245 ensurePerCellCapacities();
246
247
248 rowTokenizer.incrementNumOccurrencesOfLatestValue();
249 addFamilyPart(cell);
250 addQualifierPart(cell);
251 addAfterRowFamilyQualifier(cell);
252 }
253
254
255 @Override
256 public void write(Cell cell) {
257 ensurePerCellCapacities();
258
259 rowTokenizer.addSorted(CellUtil.fillRowRange(cell, rowRange));
260 addFamilyPart(cell);
261 addQualifierPart(cell);
262 addAfterRowFamilyQualifier(cell);
263 }
264
265
266
267
268 private void addAfterRowFamilyQualifier(Cell cell){
269
270 timestamps[totalCells] = cell.getTimestamp();
271 timestampEncoder.add(cell.getTimestamp());
272
273
274 if (includeMvccVersion) {
275 mvccVersions[totalCells] = cell.getMvccVersion();
276 mvccVersionEncoder.add(cell.getMvccVersion());
277 totalUnencodedBytes += WritableUtils.getVIntSize(cell.getMvccVersion());
278 }else{
279
280 mvccVersions[totalCells] = 0L;
281 if(totalCells == 0){
282 mvccVersionEncoder.add(0L);
283 }
284
285 }
286
287
288 typeBytes[totalCells] = cell.getTypeByte();
289 cellTypeEncoder.add(cell.getTypeByte());
290
291
292 totalValueBytes += cell.getValueLength();
293
294 values = ArrayUtils.growIfNecessary(values, totalValueBytes, 2 * totalValueBytes);
295 CellUtil.copyValueTo(cell, values, valueOffsets[totalCells]);
296 if (cell.getValueLength() > maxValueLength) {
297 maxValueLength = cell.getValueLength();
298 }
299 valueOffsets[totalCells + 1] = totalValueBytes;
300
301
302 totalUnencodedBytes += KeyValueUtil.length(cell);
303 ++totalCells;
304 }
305
306 private void addFamilyPart(Cell cell) {
307 if (MULITPLE_FAMILIES_POSSIBLE || totalCells == 0) {
308 CellUtil.fillFamilyRange(cell, familyRange);
309 familyDeduplicator.add(familyRange);
310 }
311 }
312
313 private void addQualifierPart(Cell cell) {
314 CellUtil.fillQualifierRange(cell, qualifierRange);
315 qualifierDeduplicator.add(qualifierRange);
316 }
317
318
319
320
321
322
323
324
325
326
327 @Override
328 public void flush() throws IOException {
329 compile();
330
331
332 blockMeta.writeVariableBytesToOutputStream(outputStream);
333 rowWriter.writeBytes(outputStream);
334 familyWriter.writeBytes(outputStream);
335 qualifierWriter.writeBytes(outputStream);
336 timestampEncoder.writeBytes(outputStream);
337 mvccVersionEncoder.writeBytes(outputStream);
338
339 outputStream.write(values, 0, totalValueBytes);
340 }
341
342
343
344
345
346 protected void compile(){
347 blockMeta.setNumKeyValueBytes(totalUnencodedBytes);
348 int lastValueOffset = valueOffsets[totalCells];
349 blockMeta.setValueOffsetWidth(UFIntTool.numBytes(lastValueOffset));
350 blockMeta.setValueLengthWidth(UFIntTool.numBytes(maxValueLength));
351 blockMeta.setNumValueBytes(totalValueBytes);
352 totalBytes += totalValueBytes;
353
354
355 compileTypes();
356 compileMvccVersions();
357 compileTimestamps();
358 compileQualifiers();
359 compileFamilies();
360 compileRows();
361
362 int numMetaBytes = blockMeta.calculateNumMetaBytes();
363 blockMeta.setNumMetaBytes(numMetaBytes);
364 totalBytes += numMetaBytes;
365 }
366
367
368
369
370
371
372
373
374
375 protected void compileTypes() {
376 blockMeta.setAllSameType(cellTypeEncoder.areAllSameType());
377 if(cellTypeEncoder.areAllSameType()){
378 blockMeta.setAllTypes(cellTypeEncoder.getOnlyType());
379 }
380 }
381
382 protected void compileMvccVersions() {
383 mvccVersionEncoder.compile();
384 blockMeta.setMvccVersionFields(mvccVersionEncoder);
385 int numMvccVersionBytes = mvccVersionEncoder.getOutputArrayLength();
386 totalBytes += numMvccVersionBytes;
387 }
388
389 protected void compileTimestamps() {
390 timestampEncoder.compile();
391 blockMeta.setTimestampFields(timestampEncoder);
392 int numTimestampBytes = timestampEncoder.getOutputArrayLength();
393 totalBytes += numTimestampBytes;
394 }
395
396 protected void compileQualifiers() {
397 blockMeta.setNumUniqueQualifiers(qualifierDeduplicator.size());
398 qualifierDeduplicator.compile();
399 qualifierTokenizer.addAll(qualifierDeduplicator.getSortedRanges());
400 qualifierWriter.reconstruct(blockMeta, qualifierTokenizer, false);
401 qualifierWriter.compile();
402 int numQualifierBytes = qualifierWriter.getNumBytes();
403 blockMeta.setNumQualifierBytes(numQualifierBytes);
404 totalBytes += numQualifierBytes;
405 }
406
407 protected void compileFamilies() {
408 blockMeta.setNumUniqueFamilies(familyDeduplicator.size());
409 familyDeduplicator.compile();
410 familyTokenizer.addAll(familyDeduplicator.getSortedRanges());
411 familyWriter.reconstruct(blockMeta, familyTokenizer, true);
412 familyWriter.compile();
413 int numFamilyBytes = familyWriter.getNumBytes();
414 blockMeta.setNumFamilyBytes(numFamilyBytes);
415 totalBytes += numFamilyBytes;
416 }
417
418 protected void compileRows() {
419 rowWriter.reconstruct(this);
420 rowWriter.compile();
421 int numRowBytes = rowWriter.getNumBytes();
422 blockMeta.setNumRowBytes(numRowBytes);
423 blockMeta.setRowTreeDepth(rowTokenizer.getTreeDepth());
424 totalBytes += numRowBytes;
425 }
426
427
428
429 public long getValueOffset(int index) {
430 return valueOffsets[index];
431 }
432
433 public int getValueLength(int index) {
434 return (int) (valueOffsets[index + 1] - valueOffsets[index]);
435 }
436
437
438
439 public PrefixTreeBlockMeta getBlockMeta() {
440 return blockMeta;
441 }
442
443 public Tokenizer getRowTokenizer() {
444 return rowTokenizer;
445 }
446
447 public LongEncoder getTimestampEncoder() {
448 return timestampEncoder;
449 }
450
451 public int getTotalBytes() {
452 return totalBytes;
453 }
454
455 public long[] getTimestamps() {
456 return timestamps;
457 }
458
459 public long[] getMvccVersions() {
460 return mvccVersions;
461 }
462
463 public byte[] getTypeBytes() {
464 return typeBytes;
465 }
466
467 public LongEncoder getMvccVersionEncoder() {
468 return mvccVersionEncoder;
469 }
470
471 public ByteRangeSet getFamilySorter() {
472 return familyDeduplicator;
473 }
474
475 public ByteRangeSet getQualifierSorter() {
476 return qualifierDeduplicator;
477 }
478
479 public ColumnSectionWriter getFamilyWriter() {
480 return familyWriter;
481 }
482
483 public ColumnSectionWriter getQualifierWriter() {
484 return qualifierWriter;
485 }
486
487 public RowSectionWriter getRowWriter() {
488 return rowWriter;
489 }
490
491 public ByteRange getValueByteRange() {
492 return new SimpleByteRange(values, 0, totalValueBytes);
493 }
494
495 }