View Javadoc

1   /*
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  
21  package org.apache.hadoop.hbase.util;
22  
23  import org.apache.hadoop.io.Writable;
24  
25  import java.io.DataInput;
26  import java.io.DataOutput;
27  import java.io.IOException;
28  import java.nio.ByteBuffer;
29  
30  /**
31   * Implements a <i>dynamic Bloom filter</i>, as defined in the INFOCOM 2006 paper.
32   * <p>
33   * A dynamic Bloom filter (DBF) makes use of a <code>s * m</code> bit matrix but
34   * each of the <code>s</code> rows is a standard Bloom filter. The creation
35   * process of a DBF is iterative. At the start, the DBF is a <code>1 * m</code>
36   * bit matrix, i.e., it is composed of a single standard Bloom filter.
37   * It assumes that <code>n<sub>r</sub></code> elements are recorded in the
38   * initial bit vector, where <code>n<sub>r</sub> <= n</code> (<code>n</code> is
39   * the cardinality of the set <code>A</code> to record in the filter).
40   * <p>
41   * As the size of <code>A</code> grows during the execution of the application,
42   * several keys must be inserted in the DBF.  When inserting a key into the DBF,
43   * one must first get an active Bloom filter in the matrix.  A Bloom filter is
44   * active when the number of recorded keys, <code>n<sub>r</sub></code>, is
45   * strictly less than the current cardinality of <code>A</code>, <code>n</code>.
46   * If an active Bloom filter is found, the key is inserted and
47   * <code>n<sub>r</sub></code> is incremented by one. On the other hand, if there
48   * is no active Bloom filter, a new one is created (i.e., a new row is added to
49   * the matrix) according to the current size of <code>A</code> and the element
50   * is added in this new Bloom filter and the <code>n<sub>r</sub></code> value of
51   * this new Bloom filter is set to one.  A given key is said to belong to the
52   * DBF if the <code>k</code> positions are set to one in one of the matrix rows.
53   * <p>
54   * Originally created by
55   * <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
56   *
57   * @see BloomFilter A Bloom filter
58   *
59   * @see <a href="http://www.cse.fau.edu/~jie/research/publications/Publication_files/infocom2006.pdf">Theory and Network Applications of Dynamic Bloom Filters</a>
60   */
61  public class DynamicByteBloomFilter implements BloomFilter {
62    /** Current file format version */
63    public static final int VERSION = 2;
64    /** Maximum number of keys in a dynamic Bloom filter row. */
65    protected final int keyInterval;
66    /** The maximum false positive rate per bloom */
67    protected final float errorRate;
68    /** Hash type */
69    protected final int hashType;
70    /** The number of keys recorded in the current Bloom filter. */
71    protected int curKeys;
72    /** expected size of bloom filter matrix (used during reads) */
73    protected int readMatrixSize;
74    /** The matrix of Bloom filters (contains bloom data only during writes). */
75    protected ByteBloomFilter[] matrix;
76  
77    /**
78     * Normal read constructor.  Loads bloom filter meta data.
79     * @param meta stored bloom meta data
80     * @throws IllegalArgumentException meta data is invalid
81     */
82    public DynamicByteBloomFilter(ByteBuffer meta) throws IllegalArgumentException {
83      int version = meta.getInt();
84      if (version != VERSION) throw new IllegalArgumentException("Bad version");
85  
86      this.keyInterval = meta.getInt();
87      this.errorRate  = meta.getFloat();
88      this.hashType = meta.getInt();
89      this.readMatrixSize = meta.getInt();
90      this.curKeys = meta.getInt();
91  
92      readSanityCheck();
93  
94      this.matrix = new ByteBloomFilter[1];
95      this.matrix[0] = new ByteBloomFilter(keyInterval, errorRate, hashType, 0);
96  }
97  
98    /**
99     * Normal write constructor.  Note that this doesn't allocate bloom data by
100    * default.  Instead, call allocBloom() before adding entries.
101    * @param errorRate
102    * @param hashType type of the hashing function (see <code>org.apache.hadoop.util.hash.Hash</code>).
103    * @param keyInterval Maximum number of keys to record per Bloom filter row.
104    * @throws IllegalArgumentException The input parameters were invalid
105    */
106   public DynamicByteBloomFilter(int keyInterval, float errorRate, int hashType)
107   throws IllegalArgumentException {
108     this.keyInterval = keyInterval;
109     this.errorRate = errorRate;
110     this.hashType = hashType;
111     this.curKeys = 0;
112 
113     if(keyInterval <= 0) {
114       throw new IllegalArgumentException("keyCount must be > 0");
115     }
116 
117     this.matrix = new ByteBloomFilter[1];
118     this.matrix[0] = new ByteBloomFilter(keyInterval, errorRate, hashType, 0);
119 }
120 
121   @Override
122   public void allocBloom() {
123     this.matrix[0].allocBloom();
124   }
125 
126   void readSanityCheck() throws IllegalArgumentException {
127     if (this.curKeys <= 0) {
128       throw new IllegalArgumentException("last bloom's key count invalid");
129     }
130 
131     if (this.readMatrixSize <= 0) {
132       throw new IllegalArgumentException("matrix size must be known");
133     }
134   }
135 
136   @Override
137   public void add(byte []buf, int offset, int len) {
138     BloomFilter bf = getCurBloom();
139 
140     if (bf == null) {
141       addRow();
142       bf = matrix[matrix.length - 1];
143       curKeys = 0;
144     }
145 
146     bf.add(buf, offset, len);
147     curKeys++;
148   }
149 
150   @Override
151   public void add(byte []buf) {
152     add(buf, 0, buf.length);
153   }
154 
155   /**
156    * Should only be used in tests when writing a bloom filter.
157    */
158   boolean contains(byte [] buf) {
159     return contains(buf, 0, buf.length);
160   }
161 
162   /**
163    * Should only be used in tests when writing a bloom filter.
164    */
165   boolean contains(byte [] buf, int offset, int length) {
166     for (int i = 0; i < matrix.length; i++) {
167       if (matrix[i].contains(buf, offset, length)) {
168         return true;
169       }
170     }
171     return false;
172   }
173 
174   @Override
175   public boolean contains(byte [] buf, ByteBuffer theBloom) {
176     return contains(buf, 0, buf.length, theBloom);
177   }
178 
179   @Override
180   public boolean contains(byte[] buf, int offset, int length,
181       ByteBuffer theBloom) {
182     if(offset + length > buf.length) {
183       return false;
184     }
185 
186     // current version assumes uniform size
187     int bytesPerBloom = this.matrix[0].getByteSize();
188 
189     if(theBloom.limit() != bytesPerBloom * readMatrixSize) {
190       throw new IllegalArgumentException("Bloom does not match expected size");
191     }
192 
193     ByteBuffer tmp = theBloom.duplicate();
194 
195     // note: actually searching an array of blooms that have been serialized
196     for (int m = 0; m < readMatrixSize; ++m) {
197       tmp.position(m* bytesPerBloom);
198       tmp.limit(tmp.position() + bytesPerBloom);
199       boolean match = this.matrix[0].contains(buf, offset, length, tmp.slice());
200       if (match) {
201         return true;
202       }
203     }
204 
205     // matched no bloom filters
206     return false;
207   }
208 
209   int bloomCount() {
210     return Math.max(this.matrix.length, this.readMatrixSize);
211   }
212 
213   @Override
214   public int getKeyCount() {
215     return (bloomCount()-1) * this.keyInterval + this.curKeys;
216   }
217 
218   @Override
219   public int getMaxKeys() {
220     return bloomCount() * this.keyInterval;
221   }
222 
223   @Override
224   public int getByteSize() {
225     return bloomCount() * this.matrix[0].getByteSize();
226   }
227 
228   @Override
229   public void compactBloom() {
230   }
231 
232   /**
233    * Adds a new row to <i>this</i> dynamic Bloom filter.
234    */
235   private void addRow() {
236     ByteBloomFilter[] tmp = new ByteBloomFilter[matrix.length + 1];
237 
238     for (int i = 0; i < matrix.length; i++) {
239       tmp[i] = matrix[i];
240     }
241 
242     tmp[tmp.length-1] = new ByteBloomFilter(keyInterval, errorRate, hashType, 0);
243     tmp[tmp.length-1].allocBloom();
244     matrix = tmp;
245   }
246 
247   /**
248    * Returns the currently-unfilled row in the dynamic Bloom Filter array.
249    * @return BloomFilter The active standard Bloom filter.
250    * 			 <code>Null</code> otherwise.
251    */
252   private BloomFilter getCurBloom() {
253     if (curKeys >= keyInterval) {
254       return null;
255     }
256 
257     return matrix[matrix.length - 1];
258   }
259 
260   @Override
261   public Writable getMetaWriter() {
262     return new MetaWriter();
263   }
264 
265   @Override
266   public Writable getDataWriter() {
267     return new DataWriter();
268   }
269 
270   private class MetaWriter implements Writable {
271     protected MetaWriter() {}
272     @Override
273     public void readFields(DataInput arg0) throws IOException {
274       throw new IOException("Cant read with this class.");
275     }
276 
277     @Override
278     public void write(DataOutput out) throws IOException {
279       out.writeInt(VERSION);
280       out.writeInt(keyInterval);
281       out.writeFloat(errorRate);
282       out.writeInt(hashType);
283       out.writeInt(matrix.length);
284       out.writeInt(curKeys);
285     }
286   }
287 
288   private class DataWriter implements Writable {
289     protected DataWriter() {}
290     @Override
291     public void readFields(DataInput arg0) throws IOException {
292       throw new IOException("Cant read with this class.");
293     }
294 
295     @Override
296     public void write(DataOutput out) throws IOException {
297       for (int i = 0; i < matrix.length; ++i) {
298         matrix[i].writeBloom(out);
299       }
300     }
301   }
302 }