/*
 * Copyright 2010 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.util;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;

/**
 * Implements a <i>dynamic Bloom filter</i>, as defined in the INFOCOM 2006 paper.
 * <p>
 * A dynamic Bloom filter (DBF) makes use of an <code>s * m</code> bit matrix, where
 * each of the <code>s</code> rows is a standard Bloom filter. The creation
 * process of a DBF is iterative. At the start, the DBF is a <code>1 * m</code>
 * bit matrix, i.e., it is composed of a single standard Bloom filter.
 * It assumes that <code>n<sub>r</sub></code> elements are recorded in the
 * initial bit vector, where <code>n<sub>r</sub> <= n</code> (<code>n</code> is
 * the cardinality of the set <code>A</code> to record in the filter).
 * <p>
 * As the size of <code>A</code> grows during the execution of the application,
 * several keys must be inserted in the DBF.  When inserting a key into the DBF,
 * one must first find an active Bloom filter in the matrix.  A Bloom filter is
 * active when the number of recorded keys, <code>n<sub>r</sub></code>, is
 * strictly less than the current cardinality of <code>A</code>, <code>n</code>.
 * If an active Bloom filter is found, the key is inserted and
 * <code>n<sub>r</sub></code> is incremented by one. Otherwise, a new Bloom
 * filter is created (i.e., a new row is added to the matrix) according to the
 * current size of <code>A</code>, the element is added to this new Bloom
 * filter, and its <code>n<sub>r</sub></code> value is set to one.  A given key
 * is said to belong to the DBF if its <code>k</code> hash positions are all set
 * to one in at least one of the matrix rows.
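 * <p>
 * An illustrative write-path sketch (the key interval, error rate, and
 * {@code Hash.MURMUR_HASH} below are example values only, not defaults):
 * <pre>{@code
 * DynamicByteBloomFilter bloom =
 *     new DynamicByteBloomFilter(1000, 0.01f, Hash.MURMUR_HASH);
 * bloom.allocBloom();              // bloom data is not allocated by the constructor
 * for (byte[] key : keys) {        // keys: the application's keys (hypothetical)
 *   bloom.add(key);                // a new row is started automatically every 1000 keys
 * }
 * }</pre>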
 * <p>
 * Originally created by
 * <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
 *
 * @see BloomFilter A Bloom filter
 *
 * @see <a href="http://www.cse.fau.edu/~jie/research/publications/Publication_files/infocom2006.pdf">Theory and Network Applications of Dynamic Bloom Filters</a>
 */
public class DynamicByteBloomFilter implements BloomFilter {
  /** Current file format version */
  public static final int VERSION = 2;
  /** Maximum number of keys in a dynamic Bloom filter row. */
  protected final int keyInterval;
  /** The maximum false positive rate per bloom */
  protected final float errorRate;
  /** Hash type */
  protected final int hashType;
  /** The number of keys recorded in the current Bloom filter. */
  protected int curKeys;
  /** Expected size of the bloom filter matrix (used during reads). */
  protected int readMatrixSize;
  /** The matrix of Bloom filters (contains bloom data only during writes). */
  protected ByteBloomFilter[] matrix;

  /**
   * Normal read constructor.  Loads bloom filter meta data.
   * @param meta stored bloom meta data
   * @throws IllegalArgumentException if the meta data is invalid
   */
  public DynamicByteBloomFilter(ByteBuffer meta) throws IllegalArgumentException {
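    // Meta fields are read in the same order MetaWriter.write() emits them:
    // version, key interval, error rate, hash type, matrix size, current keys.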
    int version = meta.getInt();
    if (version != VERSION) throw new IllegalArgumentException("Bad version");

    this.keyInterval = meta.getInt();
    this.errorRate = meta.getFloat();
    this.hashType = meta.getInt();
    this.readMatrixSize = meta.getInt();
    this.curKeys = meta.getInt();

    readSanityCheck();

    this.matrix = new ByteBloomFilter[1];
    this.matrix[0] = new ByteBloomFilter(keyInterval, errorRate, hashType, 0);
  }

  /**
   * Normal write constructor.  Note that this doesn't allocate bloom data by
   * default.  Instead, call allocBloom() before adding entries.
   * @param keyInterval Maximum number of keys to record per Bloom filter row.
   * @param errorRate Maximum false positive rate per Bloom filter row.
   * @param hashType type of the hashing function (see {@link org.apache.hadoop.util.hash.Hash}).
   * @throws IllegalArgumentException The input parameters were invalid
   */
  public DynamicByteBloomFilter(int keyInterval, float errorRate, int hashType)
      throws IllegalArgumentException {
    this.keyInterval = keyInterval;
    this.errorRate = errorRate;
    this.hashType = hashType;
    this.curKeys = 0;

    if (keyInterval <= 0) {
      throw new IllegalArgumentException("keyInterval must be > 0");
    }

    this.matrix = new ByteBloomFilter[1];
    this.matrix[0] = new ByteBloomFilter(keyInterval, errorRate, hashType, 0);
  }

  @Override
  public void allocBloom() {
    this.matrix[0].allocBloom();
  }

  void readSanityCheck() throws IllegalArgumentException {
    if (this.curKeys <= 0) {
      throw new IllegalArgumentException("last bloom's key count invalid");
    }

    if (this.readMatrixSize <= 0) {
      throw new IllegalArgumentException("matrix size must be known");
    }
  }

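  /**
   * Adds the key to the current (last) Bloom filter row, starting a new row
   * once the current row has reached {@code keyInterval} keys.
   */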
  @Override
  public void add(byte []buf, int offset, int len) {
    BloomFilter bf = getCurBloom();

    if (bf == null) {
      addRow();
      bf = matrix[matrix.length - 1];
      curKeys = 0;
    }

    bf.add(buf, offset, len);
    curKeys++;
  }

  @Override
  public void add(byte []buf) {
    add(buf, 0, buf.length);
  }

  /**
   * Should only be used in tests when writing a bloom filter.
   */
  boolean contains(byte [] buf) {
    return contains(buf, 0, buf.length);
  }

  /**
   * Should only be used in tests when writing a bloom filter.
   */
  boolean contains(byte [] buf, int offset, int length) {
    for (int i = 0; i < matrix.length; i++) {
      if (matrix[i].contains(buf, offset, length)) {
        return true;
      }
    }
    return false;
  }

  @Override
  public boolean contains(byte [] buf, ByteBuffer theBloom) {
    return contains(buf, 0, buf.length, theBloom);
  }

  @Override
  public boolean contains(byte[] buf, int offset, int length,
      ByteBuffer theBloom) {
    if (offset + length > buf.length) {
      return false;
    }

    // current version assumes uniform size
    int bytesPerBloom = this.matrix[0].getByteSize();

    if (theBloom.limit() != bytesPerBloom * readMatrixSize) {
      throw new IllegalArgumentException("Bloom does not match expected size");
    }

    ByteBuffer tmp = theBloom.duplicate();
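    // duplicate() shares the bloom bytes but keeps an independent position and
    // limit, so the caller's buffer is left untouched while each row is scanned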

    // note: actually searching an array of blooms that have been serialized
    for (int m = 0; m < readMatrixSize; ++m) {
      tmp.position(m * bytesPerBloom);
      tmp.limit(tmp.position() + bytesPerBloom);
      boolean match = this.matrix[0].contains(buf, offset, length, tmp.slice());
      if (match) {
        return true;
      }
    }

    // matched no bloom filters
    return false;
  }

  int bloomCount() {
    return Math.max(this.matrix.length, this.readMatrixSize);
  }

  @Override
  public int getKeyCount() {
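    // every full row holds keyInterval keys; the last (current) row holds curKeys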
    return (bloomCount() - 1) * this.keyInterval + this.curKeys;
  }

  @Override
  public int getMaxKeys() {
    return bloomCount() * this.keyInterval;
  }

  @Override
  public int getByteSize() {
    return bloomCount() * this.matrix[0].getByteSize();
  }

  @Override
  public void compactBloom() {
  }

  /**
   * Adds a new row to <i>this</i> dynamic Bloom filter.
   */
  private void addRow() {
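    // grow the matrix by one; only the new row gets bloom data allocated here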
    ByteBloomFilter[] tmp = new ByteBloomFilter[matrix.length + 1];

    for (int i = 0; i < matrix.length; i++) {
      tmp[i] = matrix[i];
    }

    tmp[tmp.length - 1] = new ByteBloomFilter(keyInterval, errorRate, hashType, 0);
    tmp[tmp.length - 1].allocBloom();
    matrix = tmp;
  }

  /**
   * Returns the currently-unfilled row in the dynamic Bloom filter array.
   * @return the active standard Bloom filter, or <code>null</code> if the
   *         current row is already full.
   */
  private BloomFilter getCurBloom() {
    if (curKeys >= keyInterval) {
      return null;
    }

    return matrix[matrix.length - 1];
  }

  @Override
  public Writable getMetaWriter() {
    return new MetaWriter();
  }

  @Override
  public Writable getDataWriter() {
    return new DataWriter();
  }

  private class MetaWriter implements Writable {
    protected MetaWriter() {}
    @Override
    public void readFields(DataInput arg0) throws IOException {
      throw new IOException("Can't read with this class.");
    }

    @Override
    public void write(DataOutput out) throws IOException {
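      // serialized in the same order the read constructor consumes the meta buffer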
      out.writeInt(VERSION);
      out.writeInt(keyInterval);
      out.writeFloat(errorRate);
      out.writeInt(hashType);
      out.writeInt(matrix.length);
      out.writeInt(curKeys);
    }
  }

  private class DataWriter implements Writable {
    protected DataWriter() {}
    @Override
    public void readFields(DataInput arg0) throws IOException {
      throw new IOException("Can't read with this class.");
    }

    @Override
    public void write(DataOutput out) throws IOException {
      for (int i = 0; i < matrix.length; ++i) {
        matrix[i].writeBloom(out);
      }
    }
  }
}