org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize
Class TokenizerNode

java.lang.Object
  extended by org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize.TokenizerNode

@InterfaceAudience.Private
public class TokenizerNode
extends Object

Individual node in a Trie structure. Each node is one of 3 types:

  • Branch: an internal trie node that may have a token and must have multiple children, but does not represent an actual input byte[], hence its numOccurrences is 0.
  • Leaf: a node with no children and where numOccurrences is >= 1. Its token represents the last bytes in the input byte[]s.
  • Nub: a combination of a branch and leaf. Its token represents the last bytes of input byte[]s and has numOccurrences >= 1, but it also has child nodes which represent input byte[]s that add bytes to this node's input byte[].

    Example inputs (numInputs=7):
      0: AAA
      1: AAA
      2: AAB
      3: AAB
      4: AAB
      5: AABQQ
      6: AABQQ

    Resulting TokenizerNodes:
      AA <- branch, numOccurrences=0, tokenStartOffset=0, token.length=2
      A  <- leaf, numOccurrences=2, tokenStartOffset=2, token.length=1
      B  <- nub, numOccurrences=3, tokenStartOffset=2, token.length=1
      QQ <- leaf, numOccurrences=2, tokenStartOffset=3, token.length=2

    numInputs == 7 == sum(numOccurrences) == 0 + 2 + 3 + 2


    Field Summary
    protected  Tokenizer builder
               
    protected  ArrayList<TokenizerNode> children
               
    protected  int firstInsertionIndex
               
    protected  long id
               
    protected  int negativeIndex
               
    protected  int nodeDepth
               
    protected  int numOccurrences
               
    protected  int outputArrayOffset
               
    protected  TokenizerNode parent
              Tree content/structure used during tokenization
    protected  ByteRange token
               
    protected  int tokenStartOffset
               
     
    Constructor Summary
    TokenizerNode(Tokenizer builder, TokenizerNode parent, int nodeDepth, int tokenStartOffset, int tokenOffset, int tokenLength)
              construct
     
    Method Summary
    protected  void addChild(TokenizerNode node)
               
     void addSorted(ByteRange bytes)
              building
     void appendNodesToExternalList(List<TokenizerNode> appendTo, boolean includeNonLeaves, boolean includeLeaves)
              moving nodes around
     void appendOutputArrayOffsets(List<Integer> offsets)
               
     void fillInBytes(byte[] arrayToFill)
               
     String getBnlIndicator(boolean indent)
               
     ArrayList<TokenizerNode> getChildren()
               
     int getFirstInsertionIndex()
               
     long getId()
               
     TokenizerNode getLastChild()
               
     int getNegativeIndex()
               
     byte[] getNewByteArray()
              writing back to byte[]'s
     void getNode(TokenizerRowSearchResult resultHolder, byte[] key, int keyOffset, int keyLength)
              searching
     int getNodeDepth()
              simple read-only methods
     int getNumBranchNodesIncludingThisNode()
              count different node types
     int getNumChildren()
               
     int getNumLeafNodesIncludingThisNode()
               
     int getNumNubNodesIncludingThisNode()
               
     int getNumOccurrences()
               
     int getOutputArrayOffset()
               
     String getPaddedTokenAndOccurrenceString()
               
     TokenizerNode getParent()
               
     ByteRange getToken()
               
     int getTokenLength()
               
     int getTokenOffset()
              autogenerated get/set
     boolean hasOccurrences()
               
    protected  void incrementNodeDepthRecursively()
               
     void incrementNumOccurrences(int d)
              Each occurrence > 1 indicates a repeat of the previous entry.
     boolean isBranch()
               
     boolean isLeaf()
               
     boolean isNub()
               
     boolean isRoot()
               
    protected  boolean matchesToken(ByteRange bytes)
               
    protected  void moveChildrenToDifferentParent(TokenizerNode newParent)
               
    protected  int numIdenticalBytes(ByteRange bytes)
               
    protected  boolean partiallyMatchesToken(ByteRange bytes)
              byte[] utils
     void reconstruct(Tokenizer builder, TokenizerNode parent, int nodeDepth, int tokenStartOffset, int tokenOffset, int tokenLength)
               
     void reset()
               
     void setBuilder(Tokenizer builder)
               
     void setFirstInsertionIndex(int firstInsertionIndex)
               
     void setId(long id)
               
     int setInsertionIndexes(int nextIndex)
               
     void setNegativeIndex(int negativeIndex)
               
     void setNumOccurrences(int numOccurrences)
               
     void setOutputArrayOffset(int outputArrayOffset)
               
     void setParent(TokenizerNode parent)
               
     void setToken(ByteRange token)
               
     void setTokenOffset(int tokenOffset)
               
    protected  void split(int numTokenBytesToRetain, ByteRange bytes)
              Called when we need to convert a leaf node into a branch with 2 leaves.
     String toString()
              printing
     
    Methods inherited from class java.lang.Object
    clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait
     

    Field Detail

    builder

    protected Tokenizer builder

    parent

    protected TokenizerNode parent
    Tree content/structure used during tokenization


    nodeDepth

    protected int nodeDepth

    tokenStartOffset

    protected int tokenStartOffset

    token

    protected ByteRange token

    numOccurrences

    protected int numOccurrences

    children

    protected ArrayList<TokenizerNode> children

    id

    protected long id

    firstInsertionIndex

    protected int firstInsertionIndex

    negativeIndex

    protected int negativeIndex

    outputArrayOffset

    protected int outputArrayOffset
    Constructor Detail

    TokenizerNode

    public TokenizerNode(Tokenizer builder,
                         TokenizerNode parent,
                         int nodeDepth,
                         int tokenStartOffset,
                         int tokenOffset,
                         int tokenLength)
    construct

    Method Detail

    reconstruct

    public void reconstruct(Tokenizer builder,
                            TokenizerNode parent,
                            int nodeDepth,
                            int tokenStartOffset,
                            int tokenOffset,
                            int tokenLength)

    reset

    public void reset()

    addSorted

    public void addSorted(ByteRange bytes)
    building


    addChild

    protected void addChild(TokenizerNode node)

    split

    protected void split(int numTokenBytesToRetain,
                         ByteRange bytes)
    Called when we need to convert a leaf node into a branch with 2 leaves. Comments inside the method assume we have token BAA starting at tokenStartOffset=0 and are adding BOO. The output will be 3 nodes:
  • 1: B <- branch
  • 2: AA <- leaf
  • 3: OO <- leaf

    Parameters:
    numTokenBytesToRetain - => 1 (the B)
    bytes - => BOO

    incrementNodeDepthRecursively

    protected void incrementNodeDepthRecursively()

    moveChildrenToDifferentParent

    protected void moveChildrenToDifferentParent(TokenizerNode newParent)

    partiallyMatchesToken

    protected boolean partiallyMatchesToken(ByteRange bytes)
    byte[] utils


    matchesToken

    protected boolean matchesToken(ByteRange bytes)

    numIdenticalBytes

    protected int numIdenticalBytes(ByteRange bytes)

    appendNodesToExternalList

    public void appendNodesToExternalList(List<TokenizerNode> appendTo,
                                          boolean includeNonLeaves,
                                          boolean includeLeaves)
    moving nodes around


    setInsertionIndexes

    public int setInsertionIndexes(int nextIndex)

    appendOutputArrayOffsets

    public void appendOutputArrayOffsets(List<Integer> offsets)

    getNode

    public void getNode(TokenizerRowSearchResult resultHolder,
                        byte[] key,
                        int keyOffset,
                        int keyLength)
    searching


    getNewByteArray

    public byte[] getNewByteArray()
    writing back to byte[]'s


    fillInBytes

    public void fillInBytes(byte[] arrayToFill)

    toString

    public String toString()
    printing

    Overrides:
    toString in class Object

    getPaddedTokenAndOccurrenceString

    public String getPaddedTokenAndOccurrenceString()

    getBnlIndicator

    public String getBnlIndicator(boolean indent)

    getNumBranchNodesIncludingThisNode

    public int getNumBranchNodesIncludingThisNode()
    count different node types


    getNumNubNodesIncludingThisNode

    public int getNumNubNodesIncludingThisNode()

    getNumLeafNodesIncludingThisNode

    public int getNumLeafNodesIncludingThisNode()

    getNodeDepth

    public int getNodeDepth()
    simple read-only methods


    getTokenLength

    public int getTokenLength()

    hasOccurrences

    public boolean hasOccurrences()

    isRoot

    public boolean isRoot()

    getNumChildren

    public int getNumChildren()

    getLastChild

    public TokenizerNode getLastChild()

    isLeaf

    public boolean isLeaf()

    isBranch

    public boolean isBranch()

    isNub

    public boolean isNub()

    incrementNumOccurrences

    public void incrementNumOccurrences(int d)
    Each occurrence > 1 indicates a repeat of the previous entry. This can be called directly by an external class without going through the process of detecting a repeat if it is a known repeat by some external mechanism. PtEncoder uses this when adding cells to a row if it knows the new cells are part of the current row.

    Parameters:
    d - increment by this amount

    getTokenOffset

    public int getTokenOffset()
    autogenerated get/set


    getParent

    public TokenizerNode getParent()

    getToken

    public ByteRange getToken()

    getNumOccurrences

    public int getNumOccurrences()

    setParent

    public void setParent(TokenizerNode parent)

    setNumOccurrences

    public void setNumOccurrences(int numOccurrences)

    getChildren

    public ArrayList<TokenizerNode> getChildren()

    getId

    public long getId()

    getFirstInsertionIndex

    public int getFirstInsertionIndex()

    setFirstInsertionIndex

    public void setFirstInsertionIndex(int firstInsertionIndex)

    getNegativeIndex

    public int getNegativeIndex()

    setNegativeIndex

    public void setNegativeIndex(int negativeIndex)

    getOutputArrayOffset

    public int getOutputArrayOffset()

    setOutputArrayOffset

    public void setOutputArrayOffset(int outputArrayOffset)

    setId

    public void setId(long id)

    setBuilder

    public void setBuilder(Tokenizer builder)

    setTokenOffset

    public void setTokenOffset(int tokenOffset)

    setToken

    public void setToken(ByteRange token)


    Copyright © 2007–2016 The Apache Software Foundation. All rights reserved.