View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize;
20  
21  import java.util.ArrayList;
22  import java.util.List;
23  
24  import org.apache.hadoop.classification.InterfaceAudience;
25  import org.apache.hadoop.hbase.util.ByteRange;
26  import org.apache.hadoop.hbase.util.Bytes;
27  import org.apache.hadoop.hbase.util.CollectionUtils;
28  import org.apache.hadoop.hbase.util.Strings;
29  
30  import com.google.common.collect.Lists;
31  
32  /**
33   * Individual node in a Trie structure.  Each node is one of 3 types:
34   * <li>Branch: an internal trie node that may have a token and must have multiple children, but does
35   * not represent an actual input byte[], hence its numOccurrences is 0
36   * <li>Leaf: a node with no children and where numOccurrences is >= 1.  Its token represents the
37   * last bytes in the input byte[]s.
38   * <li>Nub: a combination of a branch and leaf.  Its token represents the last bytes of input
39   * byte[]s and has numOccurrences >= 1, but it also has child nodes which represent input byte[]s
40   * that add bytes to this node's input byte[].
41   * <br/><br/>
42   * Example inputs (numInputs=7):
43   * 0: AAA
44   * 1: AAA
45   * 2: AAB
46   * 3: AAB
47   * 4: AAB
48   * 5: AABQQ
49   * 6: AABQQ
50   * <br/><br/>
51   * Resulting TokenizerNodes:
52   * AA <- branch, numOccurrences=0, tokenStartOffset=0, token.length=2
53   * A  <- leaf, numOccurrences=2, tokenStartOffset=2, token.length=1
54   * B  <- nub, numOccurrences=3, tokenStartOffset=2, token.length=1
55   * QQ <- leaf, numOccurrences=2, tokenStartOffset=3, token.length=2
56   * <br/><br/>
57   * numInputs == 7 == sum(numOccurrences) == 0 + 2 + 3 + 2
58   */
59  @InterfaceAudience.Private
public class TokenizerNode{

  /*
   * Ref to data structure wrapper.  The Tokenizer owns the shared token byte buffer
   * (builder.tokens), allocates/recycles nodes, and tracks global stats such as max node depth.
   */
  protected Tokenizer builder;

  /******************************************************************
   * Tree content/structure used during tokenization
   * ****************************************************************/

  /*
   * ref to parent trie node; null only for the root node
   */
  protected TokenizerNode parent;

  /*
   * node depth in trie, irrespective of each node's token length.  The root is depth 0; a child's
   * depth is always parent depth + 1, even when a node's token spans multiple bytes.
   */
  protected int nodeDepth;

  /*
   * start index of this token in original byte[] (i.e. the number of bytes contributed by all
   * ancestors' tokens combined)
   */
  protected int tokenStartOffset;

  /*
   * bytes for this trie node, viewed as a slice of the builder's shared token buffer.
   * Can be length 0 in root node.
   */
  protected ByteRange token;

  /*
   * A count of occurrences in the input byte[]s, not the trie structure. 0 for branch nodes, 1+ for
   * nubs and leaves. If the same byte[] is added to the trie multiple times, this is the only thing
   * that changes in the tokenizer. As a result, duplicate byte[]s are very inexpensive to encode.
   */
  protected int numOccurrences;

  /*
   * The maximum fan-out of a byte[] trie is 256, so there are a maximum of 256
   * child nodes.  Kept in sorted insertion order (inputs arrive pre-sorted via addSorted).
   */
  protected ArrayList<TokenizerNode> children;


  /*
   * Fields used later in the encoding process for sorting the nodes into the order they'll be
   * written to the output byte[].  With these fields, the TokenizerNode and therefore Tokenizer
   * are not generic data structures but instead are specific to HBase PrefixTree encoding.
   */

  /*
   * unique id assigned to each TokenizerNode by the builder (see Tokenizer#nextNodeId)
   */
  protected long id;

  /*
   * index of this node's first occurrence in the overall sorted input sequence;
   * set >=0 for nubs and leaves, remains -1 for branches
   */
  protected int firstInsertionIndex = -1;

  /*
   * A positive value indicating how many bytes before the end of the block this node will start. If
   * the section is 55 bytes and negativeOffset is 9, then the node will start at 46.
   */
  protected int negativeIndex = 0;

  /*
   * The offset in the output array at which to start writing this node's token bytes.  Influenced
   * by the lengths of all tokens sorted before this one.
   */
  protected int outputArrayOffset = -1;


  /*********************** construct *****************************/

  public TokenizerNode(Tokenizer builder, TokenizerNode parent, int nodeDepth,
      int tokenStartOffset, int tokenOffset, int tokenLength) {
    this.token = new ByteRange();
    reconstruct(builder, parent, nodeDepth, tokenStartOffset, tokenOffset, tokenLength);
    this.children = Lists.newArrayList();
  }

  /*
   * Sub-constructor for initializing all fields without allocating a new object.  Used by the
   * regular constructor.  Also used by the builder when recycling a previously reset() node:
   * points this node's token at [tokenOffset, tokenOffset+tokenLength) of the builder's shared
   * byte buffer and starts it as a leaf with a single occurrence.
   */
  public void reconstruct(Tokenizer builder, TokenizerNode parent, int nodeDepth,
      int tokenStartOffset, int tokenOffset, int tokenLength) {
    this.builder = builder;
    this.id = builder.nextNodeId();
    this.parent = parent;
    this.nodeDepth = nodeDepth;
    builder.submitMaxNodeDepthCandidate(nodeDepth);
    this.tokenStartOffset = tokenStartOffset;
    this.token.set(builder.tokens, tokenOffset, tokenLength);
    this.numOccurrences = 1;
  }

  /*
   * Clear the state of this node so that it looks like it was just allocated.  Called when the
   * builder recycles nodes between encoding runs; pairs with reconstruct(..).
   */
  public void reset() {
    builder = null;
    parent = null;
    nodeDepth = 0;
    tokenStartOffset = 0;
    token.clear();
    numOccurrences = 0;
    children.clear();// branches & nubs

    // ids/offsets. used during writing to byte[]
    id = 0;
    firstInsertionIndex = -1;// set >=0 for nubs and leaves
    negativeIndex = 0;
    outputArrayOffset = -1;
  }


  /************************* building *********************************/

  /*
   * <li>Only public method used during the tokenization process
   * <li>Requires that the input ByteRange sort after the previous, and therefore after all previous
   * inputs
   * <li>Only looks at bytes of the input array that align with this node's token
   */
  public void addSorted(final ByteRange bytes) {// recursively build the tree

    /*
     * Recurse deeper into the existing trie structure.  Because inputs arrive sorted, only the
     * most recently added child can possibly share a prefix with the new input.
     */
    if (matchesToken(bytes) && CollectionUtils.notEmpty(children)) {
      TokenizerNode lastChild = CollectionUtils.getLast(children);
      if (lastChild.partiallyMatchesToken(bytes)) {
        lastChild.addSorted(bytes);
        return;
      }
    }

    /*
     * Recursion ended.  We must either
     * <li>1: increment numOccurrences if this input was equal to the previous
     * <li>2: convert this node from a leaf to a nub, and add a new child leaf
     * <li>3: split this node into a branch and leaf, and then add a second leaf
     */

    // add it as a child of this node
    int numIdenticalTokenBytes = numIdenticalBytes(bytes);// should be <= token.length
    int tailOffset = tokenStartOffset + numIdenticalTokenBytes;
    int tailLength = bytes.getLength() - tailOffset;

    if (numIdenticalTokenBytes == token.getLength()) {
      if (tailLength == 0) {// identical to this node (case 1)
        incrementNumOccurrences(1);
      } else {// identical to this node, but with a few extra tailing bytes. (leaf -> nub) (case 2)
        int childNodeDepth = nodeDepth + 1;
        int childTokenStartOffset = tokenStartOffset + numIdenticalTokenBytes;
        TokenizerNode newChildNode = builder.addNode(this, childNodeDepth, childTokenStartOffset,
          bytes, tailOffset);
        addChild(newChildNode);
      }
    } else {//numIdenticalBytes > 0, split into branch/leaf and then add second leaf (case 3)
      split(numIdenticalTokenBytes, bytes);
    }
  }


  // attach a node as this node's child, fixing up its parent pointer
  protected void addChild(TokenizerNode node) {
    node.setParent(this);
    children.add(node);
  }


  /**
   * Called when we need to convert a leaf node into a branch with 2 leaves. Comments inside the
   * method assume we have token BAA starting at tokenStartOffset=0 and are adding BOO. The output
   * will be 3 nodes:<br/>
   * <li>1: B <- branch
   * <li>2: AA <- leaf
   * <li>3: OO <- leaf
   *
   * @param numTokenBytesToRetain => 1 (the B)
   * @param bytes => BOO
   */
  protected void split(int numTokenBytesToRetain, final ByteRange bytes) {
    int childNodeDepth = nodeDepth;
    int childTokenStartOffset = tokenStartOffset + numTokenBytesToRetain;

    //create leaf AA
    TokenizerNode firstChild = builder.addNode(this, childNodeDepth, childTokenStartOffset,
      token, numTokenBytesToRetain);
    firstChild.setNumOccurrences(numOccurrences);// do before clearing this node's numOccurrences
    token.setLength(numTokenBytesToRetain);//shorten current token from BAA to B
    numOccurrences = 0;//current node is now a branch

    moveChildrenToDifferentParent(firstChild);//point the new leaf (AA) to the new branch (B)
    addChild(firstChild);//add the new leaf (AA) to the branch's (B's) children

    //create leaf OO
    TokenizerNode secondChild = builder.addNode(this, childNodeDepth, childTokenStartOffset,
      bytes, tokenStartOffset + numTokenBytesToRetain);
    addChild(secondChild);//add the new leaf (OO) to the branch's (B's) children

    // we inserted branch node B as a new level above/before the two children, so increment the
    // depths of the children below
    firstChild.incrementNodeDepthRecursively();
    secondChild.incrementNodeDepthRecursively();
  }


  // bump the depth of this subtree by one, reporting each new depth to the builder so it can
  // track the overall max
  protected void incrementNodeDepthRecursively() {
    ++nodeDepth;
    builder.submitMaxNodeDepthCandidate(nodeDepth);
    for (int i = 0; i < children.size(); ++i) {
      children.get(i).incrementNodeDepthRecursively();
    }
  }


  // re-home all of this node's children under newParent, leaving this node childless.
  // used by split(..) when this node becomes the new branch
  protected void moveChildrenToDifferentParent(TokenizerNode newParent) {
    for (int i = 0; i < children.size(); ++i) {
      TokenizerNode child = children.get(i);
      child.setParent(newParent);
      newParent.children.add(child);
    }
    children.clear();
  }


  /************************ byte[] utils *************************/

  // true if the input shares at least one leading byte with this node's token
  protected boolean partiallyMatchesToken(ByteRange bytes) {
    return numIdenticalBytes(bytes) > 0;
  }

  // true if the input fully covers this node's token bytes
  protected boolean matchesToken(ByteRange bytes) {
    return numIdenticalBytes(bytes) == getTokenLength();
  }

  // number of leading bytes of this token that equal the input bytes starting at tokenStartOffset
  protected int numIdenticalBytes(ByteRange bytes) {
    return token.numEqualPrefixBytes(bytes, tokenStartOffset);
  }


  /***************** moving nodes around ************************/

  /**
   * Depth-first (pre-order) collection of this subtree's nodes into an external list, filtered by
   * node type.
   *
   * @param appendTo destination list; nodes are appended, never cleared
   * @param includeNonLeaves include branch and nub-acting-as-branch nodes
   * @param includeLeaves include leaf nodes
   */
  public void appendNodesToExternalList(List<TokenizerNode> appendTo, boolean includeNonLeaves,
      boolean includeLeaves) {
    if (includeNonLeaves && !isLeaf() || includeLeaves && isLeaf()) {
      appendTo.add(this);
    }
    for (int i = 0; i < children.size(); ++i) {
      TokenizerNode child = children.get(i);
      child.appendNodesToExternalList(appendTo, includeNonLeaves, includeLeaves);
    }
  }

  /**
   * Pre-order assignment of firstInsertionIndex values across the subtree.  Each nub/leaf claims
   * a contiguous range of numOccurrences indexes; branches claim none.
   *
   * @param nextIndex first index available for this subtree
   * @return the next unused index after this subtree has been numbered
   */
  public int setInsertionIndexes(int nextIndex) {
    int newNextIndex = nextIndex;
    if (hasOccurrences()) {
      setFirstInsertionIndex(nextIndex);
      newNextIndex += numOccurrences;
    }
    for (int i = 0; i < children.size(); ++i) {
      TokenizerNode child = children.get(i);
      newNextIndex = child.setInsertionIndexes(newNextIndex);
    }
    return newNextIndex;
  }

  /**
   * Pre-order collection of outputArrayOffset for every nub/leaf in the subtree.
   */
  public void appendOutputArrayOffsets(List<Integer> offsets) {
    if (hasOccurrences()) {
      offsets.add(outputArrayOffset);
    }
    for (int i = 0; i < children.size(); ++i) {
      TokenizerNode child = children.get(i);
      child.appendOutputArrayOffsets(offsets);
    }
  }


  /***************** searching *********************************/

  /*
   * Do a trie style search through the tokenizer.  One option for looking up families or qualifiers
   * during encoding, but currently unused in favor of tracking this information as they are added.
   *
   * Keeping code pending further performance testing.
   */
  public void getNode(TokenizerRowSearchResult resultHolder, byte[] key, int keyOffset,
      int keyLength) {
    int thisNodeDepthPlusLength = tokenStartOffset + token.getLength();

    // quick check if the key is shorter than this node (may not work for binary search)
    if (CollectionUtils.isEmpty(children)) {
      if (thisNodeDepthPlusLength < keyLength) {// ran out of bytes
        resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
        return;
      }
    }

    // all token bytes must match
    for (int i = 0; i < token.getLength(); ++i) {
      if (key[tokenStartOffset + keyOffset + i] != token.get(i)) {
        // TODO return whether it's before or after so we can binary search
        resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
        return;
      }
    }

    // exact-length match on a nub/leaf means the key exists in the trie
    if (thisNodeDepthPlusLength == keyLength && numOccurrences > 0) {
      resultHolder.set(TokenizerRowSearchPosition.MATCH, this);// MATCH
      return;
    }

    if (CollectionUtils.notEmpty(children)) {
      // TODO binary search the children
      for (int i = 0; i < children.size(); ++i) {
        TokenizerNode child = children.get(i);
        child.getNode(resultHolder, key, keyOffset, keyLength);
        if (resultHolder.isMatch()) {
          return;
        } else if (resultHolder.getDifference() == TokenizerRowSearchPosition.BEFORE) {
          // passed it, so it doesn't exist
          resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
          return;
        }
        // key is still AFTER the current node, so continue searching
      }
    }

    // checked all children (or there were no children), and didn't find it
    resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
    return;
  }


  /****************** writing back to byte[]'s *************************/

  /**
   * Reconstruct the full input byte[] that ends at this node by walking up through the ancestors.
   *
   * @return a newly allocated byte[] of length tokenStartOffset + token length
   */
  public byte[] getNewByteArray() {
    byte[] arrayToFill = new byte[tokenStartOffset + token.getLength()];
    fillInBytes(arrayToFill);
    return arrayToFill;
  }

  /**
   * Copy this node's token bytes into arrayToFill at this node's offset, then recurse upward so
   * ancestors fill in their (earlier) segments.  arrayToFill must be at least
   * tokenStartOffset + token length bytes long.
   */
  public void fillInBytes(byte[] arrayToFill) {
    for (int i = 0; i < token.getLength(); ++i) {
      arrayToFill[tokenStartOffset + i] = token.get(i);
    }
    if (parent != null) {
      parent.fillInBytes(arrayToFill);
    }
  }


  /************************** printing ***********************/

  @Override
  public String toString() {
    String s = "";
    if (parent == null) {
      s += "R ";// root marker
    } else {
      s += getBnlIndicator(false) + " " + Bytes.toString(parent.getNewByteArray());
    }
    s += "[" + Bytes.toString(token.deepCopyToNewArray()) + "]";
    if (numOccurrences > 0) {
      s += "x" + numOccurrences;
    }
    return s;
  }

  /**
   * Debug/diagnostic line: type indicator, occurrence count, depth, optional output offset, then
   * the token text indented by tokenStartOffset so sibling lines align like the trie.  Spaces in
   * the token are shown as underscores to keep the alignment readable.
   */
  public String getPaddedTokenAndOccurrenceString() {
    StringBuilder sb = new StringBuilder();
    sb.append(getBnlIndicator(true));
    sb.append(Strings.padFront(numOccurrences + "", ' ', 3));
    sb.append(Strings.padFront(nodeDepth + "", ' ', 3));
    if (outputArrayOffset >= 0) {
      sb.append(Strings.padFront(outputArrayOffset + "", ' ', 3));
    }
    sb.append("  ");
    for (int i = 0; i < tokenStartOffset; ++i) {
      sb.append(" ");
    }
    sb.append(Bytes.toString(token.deepCopyToNewArray()).replaceAll(" ", "_"));
    return sb.toString();
  }

  /**
   * Single-letter Branch/Nub/Leaf indicator.  When indent is true the letter is padded to 3 chars
   * for column-aligned debug output; otherwise a bare "B", "N", or "L" is returned.
   */
  public String getBnlIndicator(boolean indent) {
    if (indent) {
      if (isNub()) {
        return " N ";
      }
      return isBranch() ? "B  " : "  L";
    }
    if (isNub()) {
      return "N";
    }
    return isBranch() ? "B" : "L";
  }


  /********************** count different node types ********************/

  // recursive count of branch nodes in this subtree (leaves contribute 0, nubs are not branches)
  public int getNumBranchNodesIncludingThisNode() {
    if (isLeaf()) {
      return 0;
    }
    int totalFromThisPlusChildren = isBranch() ? 1 : 0;
    for (int i = 0; i < children.size(); ++i) {
      TokenizerNode child = children.get(i);
      totalFromThisPlusChildren += child.getNumBranchNodesIncludingThisNode();
    }
    return totalFromThisPlusChildren;
  }

  // recursive count of nub nodes in this subtree
  public int getNumNubNodesIncludingThisNode() {
    if (isLeaf()) {
      return 0;
    }
    int totalFromThisPlusChildren = isNub() ? 1 : 0;
    for (int i = 0; i < children.size(); ++i) {
      TokenizerNode child = children.get(i);
      totalFromThisPlusChildren += child.getNumNubNodesIncludingThisNode();
    }
    return totalFromThisPlusChildren;
  }

  // recursive count of leaf nodes in this subtree
  public int getNumLeafNodesIncludingThisNode() {
    if (isLeaf()) {
      return 1;
    }
    int totalFromChildren = 0;
    for (int i = 0; i < children.size(); ++i) {
      TokenizerNode child = children.get(i);
      totalFromChildren += child.getNumLeafNodesIncludingThisNode();
    }
    return totalFromChildren;
  }


  /*********************** simple read-only methods *******************************/

  public int getNodeDepth() {
    return nodeDepth;
  }

  public int getTokenLength() {
    return token.getLength();
  }

  // true for nubs and leaves; false for branches (and freshly reset nodes)
  public boolean hasOccurrences() {
    return numOccurrences > 0;
  }

  public boolean isRoot() {
    return this.parent == null;
  }

  public int getNumChildren() {
    return CollectionUtils.nullSafeSize(children);
  }

  // most recently added child, or null if this node has no children
  public TokenizerNode getLastChild() {
    if (CollectionUtils.isEmpty(children)) {
      return null;
    }
    return CollectionUtils.getLast(children);
  }

  // leaf: no children and at least one occurrence
  public boolean isLeaf() {
    return CollectionUtils.isEmpty(children) && hasOccurrences();
  }

  // branch: has children but no occurrences of its own
  public boolean isBranch() {
    return CollectionUtils.notEmpty(children) && !hasOccurrences();
  }

  // nub: has children AND occurrences (an input ends here, and longer inputs pass through)
  public boolean isNub() {
    return CollectionUtils.notEmpty(children) && hasOccurrences();
  }


  /********************** simple mutation methods *************************/

  /**
   * Each occurrence > 1 indicates a repeat of the previous entry.  This can be called directly by
   * an external class without going through the process of detecting a repeat if it is a known
   * repeat by some external mechanism.  PtEncoder uses this when adding cells to a row if it knows
   * the new cells are part of the current row.
   * @param d increment by this amount
   */
  public void incrementNumOccurrences(int d) {
    numOccurrences += d;
  }


  /************************* autogenerated get/set ******************/

  public int getTokenOffset() {
    return tokenStartOffset;
  }

  public TokenizerNode getParent() {
    return parent;
  }

  public ByteRange getToken() {
    return token;
  }

  public int getNumOccurrences() {
    return numOccurrences;
  }

  public void setParent(TokenizerNode parent) {
    this.parent = parent;
  }

  public void setNumOccurrences(int numOccurrences) {
    this.numOccurrences = numOccurrences;
  }

  public ArrayList<TokenizerNode> getChildren() {
    return children;
  }

  public long getId() {
    return id;
  }

  public int getFirstInsertionIndex() {
    return firstInsertionIndex;
  }

  public void setFirstInsertionIndex(int firstInsertionIndex) {
    this.firstInsertionIndex = firstInsertionIndex;
  }

  public int getNegativeIndex() {
    return negativeIndex;
  }

  public void setNegativeIndex(int negativeIndex) {
    this.negativeIndex = negativeIndex;
  }

  public int getOutputArrayOffset() {
    return outputArrayOffset;
  }

  public void setOutputArrayOffset(int outputArrayOffset) {
    this.outputArrayOffset = outputArrayOffset;
  }

  public void setId(long id) {
    this.id = id;
  }

  public void setBuilder(Tokenizer builder) {
    this.builder = builder;
  }

  public void setTokenOffset(int tokenOffset) {
    this.tokenStartOffset = tokenOffset;
  }

  public void setToken(ByteRange token) {
    this.token = token;
  }

}