View Javadoc

1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.regionserver;
21  
22  import static org.junit.Assert.assertEquals;
23  import static org.junit.Assert.assertNotNull;
24  import static org.junit.Assert.assertTrue;
25  import static org.junit.Assert.fail;
26  
27  import java.io.IOException;
28  import java.util.ArrayList;
29  import java.util.Collections;
30  import java.util.List;
31  import java.util.Random;
32  import java.util.TreeSet;
33  
34  import org.apache.commons.logging.Log;
35  import org.apache.commons.logging.LogFactory;
36  import org.apache.hadoop.conf.Configuration;
37  import org.apache.hadoop.fs.FileSystem;
38  import org.apache.hadoop.fs.Path;
39  import org.apache.hadoop.hbase.*;
40  import org.apache.hadoop.hbase.client.Scan;
41  import org.apache.hadoop.hbase.io.hfile.BlockCache;
42  import org.apache.hadoop.hbase.io.hfile.CacheConfig;
43  import org.apache.hadoop.hbase.io.hfile.NoOpDataBlockEncoder;
44  import org.apache.hadoop.hbase.io.hfile.HFile;
45  import org.apache.hadoop.hbase.io.hfile.TestHFileWriterV2;
46  import org.apache.hadoop.hbase.util.BloomFilterFactory;
47  import org.apache.hadoop.hbase.util.ByteBloomFilter;
48  import org.apache.hadoop.hbase.util.Bytes;
49  import org.apache.hadoop.hbase.util.CompoundBloomFilter;
50  import org.apache.hadoop.hbase.util.CompoundBloomFilterBase;
51  import org.apache.hadoop.hbase.util.CompoundBloomFilterWriter;
52  import org.junit.Before;
53  import org.junit.Test;
54  import org.junit.experimental.categories.Category;
55  
56  /**
57   * Tests writing Bloom filter blocks in the same part of the file as data
58   * blocks.
59   */
60  @Category(MediumTests.class)
61  public class TestCompoundBloomFilter {
62  
63    private static final HBaseTestingUtility TEST_UTIL =
64        new HBaseTestingUtility();
65  
66    private static final Log LOG = LogFactory.getLog(
67        TestCompoundBloomFilter.class);
68  
69    private static final int NUM_TESTS = 9;
70    private static final BloomType BLOOM_TYPES[] = { BloomType.ROW,
71        BloomType.ROW, BloomType.ROWCOL, BloomType.ROWCOL, BloomType.ROW,
72        BloomType.ROWCOL, BloomType.ROWCOL, BloomType.ROWCOL, BloomType.ROW };
73  
74    private static final int NUM_KV[];
75    static {
76      final int N = 10000; // Only used in initialization.
77      NUM_KV = new int[] { 21870, N, N, N, N, 1000, N, 7500, 7500};
78      assert NUM_KV.length == NUM_TESTS;
79    }
80  
81    private static final int BLOCK_SIZES[];
82    static {
83      final int blkSize = 65536;
84      BLOCK_SIZES = new int[] { 512, 1000, blkSize, blkSize, blkSize, 128, 300,
85          blkSize, blkSize };
86      assert BLOCK_SIZES.length == NUM_TESTS;
87    }
88  
89    /**
90     * Be careful not to specify too high a Bloom filter block size, otherwise
91     * there will only be one oversized chunk and the observed false positive
92     * rate will be too low.
93     */
94    private static final int BLOOM_BLOCK_SIZES[] = { 1000, 4096, 4096, 4096,
95        8192, 128, 1024, 600, 600 };
96    static { assert BLOOM_BLOCK_SIZES.length == NUM_TESTS; }
97  
98    private static final double TARGET_ERROR_RATES[] = { 0.025, 0.01, 0.015,
99        0.01, 0.03, 0.01, 0.01, 0.07, 0.07 };
100   static { assert TARGET_ERROR_RATES.length == NUM_TESTS; }
101 
102   /** A false positive rate that is obviously too high. */
103   private static final double TOO_HIGH_ERROR_RATE;
104   static {
105     double m = 0;
106     for (double errorRate : TARGET_ERROR_RATES)
107       m = Math.max(m, errorRate);
108     TOO_HIGH_ERROR_RATE = m + 0.03;
109   }
110 
111   private static Configuration conf;
112   private static CacheConfig cacheConf;
113   private FileSystem fs;
114   private BlockCache blockCache;
115 
116   /** A message of the form "in test#<number>:" to include in logging. */
117   private String testIdMsg;
118 
119   private static final int GENERATION_SEED = 2319;
120   private static final int EVALUATION_SEED = 135;
121 
122   @Before
123   public void setUp() throws IOException {
124     conf = TEST_UTIL.getConfiguration();
125 
126     // This test requires the most recent HFile format (i.e. v2).
127     conf.setInt(HFile.FORMAT_VERSION_KEY, HFile.MAX_FORMAT_VERSION);
128 
129     fs = FileSystem.get(conf);
130 
131     cacheConf = new CacheConfig(conf);
132     blockCache = cacheConf.getBlockCache();
133     assertNotNull(blockCache);
134   }
135 
136   private List<KeyValue> createSortedKeyValues(Random rand, int n) {
137     List<KeyValue> kvList = new ArrayList<KeyValue>(n);
138     for (int i = 0; i < n; ++i)
139       kvList.add(TestHFileWriterV2.randomKeyValue(rand));
140     Collections.sort(kvList, KeyValue.COMPARATOR);
141     return kvList;
142   }
143 
144   @Test
145   public void testCompoundBloomFilter() throws IOException {
146     conf.setBoolean(BloomFilterFactory.IO_STOREFILE_BLOOM_ENABLED, true);
147     for (int t = 0; t < NUM_TESTS; ++t) {
148       conf.setFloat(BloomFilterFactory.IO_STOREFILE_BLOOM_ERROR_RATE,
149           (float) TARGET_ERROR_RATES[t]);
150 
151       testIdMsg = "in test #" + t + ":";
152       Random generationRand = new Random(GENERATION_SEED);
153       List<KeyValue> kvs = createSortedKeyValues(generationRand, NUM_KV[t]);
154       BloomType bt = BLOOM_TYPES[t];
155       Path sfPath = writeStoreFile(t, bt, kvs);
156       readStoreFile(t, bt, kvs, sfPath);
157     }
158   }
159 
160   /**
161    * Validates the false positive ratio by computing its z-value and comparing
162    * it to the provided threshold.
163    *
164    * @param falsePosRate experimental positive rate
165    * @param nTrials the number of Bloom filter checks
166    * @param zValueBoundary z-value boundary, positive for an upper bound and
167    *          negative for a lower bound
168    * @param cbf the compound Bloom filter we are using
169    * @param additionalMsg additional message to include in log output and
170    *          assertion failures
171    */
172   private void validateFalsePosRate(double falsePosRate, int nTrials,
173       double zValueBoundary, CompoundBloomFilter cbf, String additionalMsg) {
174     double p = BloomFilterFactory.getErrorRate(conf);
175     double zValue = (falsePosRate - p) / Math.sqrt(p * (1 - p) / nTrials);
176 
177     String assortedStatsStr = " (targetErrorRate=" + p + ", falsePosRate="
178         + falsePosRate + ", nTrials=" + nTrials + ")";
179     LOG.info("z-value is " + zValue + assortedStatsStr);
180 
181     boolean isUpperBound = zValueBoundary > 0;
182 
183     if (isUpperBound && zValue > zValueBoundary ||
184         !isUpperBound && zValue < zValueBoundary) {
185       String errorMsg = "False positive rate z-value " + zValue + " is "
186           + (isUpperBound ? "higher" : "lower") + " than " + zValueBoundary
187           + assortedStatsStr + ". Per-chunk stats:\n"
188           + cbf.formatTestingStats();
189       fail(errorMsg + additionalMsg);
190     }
191   }
192 
193   private void readStoreFile(int t, BloomType bt, List<KeyValue> kvs,
194       Path sfPath) throws IOException {
195     StoreFile sf = new StoreFile(fs, sfPath, conf, cacheConf, bt,
196         NoOpDataBlockEncoder.INSTANCE);
197     StoreFile.Reader r = sf.createReader();
198     final boolean pread = true; // does not really matter
199     StoreFileScanner scanner = r.getStoreFileScanner(true, pread);
200 
201     {
202       // Test for false negatives (not allowed).
203       int numChecked = 0;
204       for (KeyValue kv : kvs) {
205         byte[] row = kv.getRow();
206         boolean present = isInBloom(scanner, row, kv.getQualifier());
207         assertTrue(testIdMsg + " Bloom filter false negative on row "
208             + Bytes.toStringBinary(row) + " after " + numChecked
209             + " successful checks", present);
210         ++numChecked;
211       }
212     }
213 
214     // Test for false positives (some percentage allowed). We test in two modes:
215     // "fake lookup" which ignores the key distribution, and production mode.
216     for (boolean fakeLookupEnabled : new boolean[] { true, false }) {
217       ByteBloomFilter.setFakeLookupMode(fakeLookupEnabled);
218       try {
219         String fakeLookupModeStr = ", fake lookup is " + (fakeLookupEnabled ?
220             "enabled" : "disabled");
221         CompoundBloomFilter cbf = (CompoundBloomFilter) r.getGeneralBloomFilter();
222         cbf.enableTestingStats();
223         int numFalsePos = 0;
224         Random rand = new Random(EVALUATION_SEED);
225         int nTrials = NUM_KV[t] * 10;
226         for (int i = 0; i < nTrials; ++i) {
227           byte[] query = TestHFileWriterV2.randomRowOrQualifier(rand);
228           if (isInBloom(scanner, query, bt, rand)) {
229             numFalsePos += 1;
230           }
231         }
232         double falsePosRate = numFalsePos * 1.0 / nTrials;
233         LOG.debug(String.format(testIdMsg
234             + " False positives: %d out of %d (%f)",
235             numFalsePos, nTrials, falsePosRate) + fakeLookupModeStr);
236 
237         // Check for obvious Bloom filter crashes.
238         assertTrue("False positive is too high: " + falsePosRate + " (greater "
239             + "than " + TOO_HIGH_ERROR_RATE + ")" + fakeLookupModeStr,
240             falsePosRate < TOO_HIGH_ERROR_RATE);
241 
242         // Now a more precise check to see if the false positive rate is not
243         // too high. The reason we use a relaxed restriction for the real-world
244         // case as opposed to the "fake lookup" case is that our hash functions
245         // are not completely independent.
246 
247         double maxZValue = fakeLookupEnabled ? 1.96 : 2.5;
248         validateFalsePosRate(falsePosRate, nTrials, maxZValue, cbf,
249             fakeLookupModeStr);
250 
251         // For checking the lower bound we need to eliminate the last chunk,
252         // because it is frequently smaller and the false positive rate in it
253         // is too low. This does not help if there is only one under-sized
254         // chunk, though.
255         int nChunks = cbf.getNumChunks();
256         if (nChunks > 1) {
257           numFalsePos -= cbf.getNumPositivesForTesting(nChunks - 1);
258           nTrials -= cbf.getNumQueriesForTesting(nChunks - 1);
259           falsePosRate = numFalsePos * 1.0 / nTrials;
260           LOG.info(testIdMsg + " False positive rate without last chunk is " +
261               falsePosRate + fakeLookupModeStr);
262         }
263 
264         validateFalsePosRate(falsePosRate, nTrials, -2.58, cbf,
265             fakeLookupModeStr);
266       } finally {
267         ByteBloomFilter.setFakeLookupMode(false);
268       }
269     }
270 
271     r.close(true); // end of test so evictOnClose
272   }
273 
274   private boolean isInBloom(StoreFileScanner scanner, byte[] row, BloomType bt,
275       Random rand) {
276     return isInBloom(scanner, row,
277         TestHFileWriterV2.randomRowOrQualifier(rand));
278   }
279 
280   private boolean isInBloom(StoreFileScanner scanner, byte[] row,
281       byte[] qualifier) {
282     Scan scan = new Scan(row, row);
283     TreeSet<byte[]> columns = new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR);
284     columns.add(qualifier);
285     return scanner.shouldUseScanner(scan, columns, Long.MIN_VALUE);
286   }
287 
288   private Path writeStoreFile(int t, BloomType bt, List<KeyValue> kvs)
289       throws IOException {
290     conf.setInt(BloomFilterFactory.IO_STOREFILE_BLOOM_BLOCK_SIZE,
291         BLOOM_BLOCK_SIZES[t]);
292     conf.setBoolean(CacheConfig.CACHE_BLOCKS_ON_WRITE_KEY, true);
293     cacheConf = new CacheConfig(conf);
294 
295     StoreFile.Writer w = new StoreFile.WriterBuilder(conf, cacheConf, fs,
296         BLOCK_SIZES[t])
297             .withOutputDir(TEST_UTIL.getDataTestDir())
298             .withBloomType(bt)
299             .withChecksumType(HFile.DEFAULT_CHECKSUM_TYPE)
300             .withBytesPerChecksum(HFile.DEFAULT_BYTES_PER_CHECKSUM)
301             .build();
302 
303     assertTrue(w.hasGeneralBloom());
304     assertTrue(w.getGeneralBloomWriter() instanceof CompoundBloomFilterWriter);
305     CompoundBloomFilterWriter cbbf =
306         (CompoundBloomFilterWriter) w.getGeneralBloomWriter();
307 
308     int keyCount = 0;
309     KeyValue prev = null;
310     LOG.debug("Total keys/values to insert: " + kvs.size());
311     for (KeyValue kv : kvs) {
312       w.append(kv);
313 
314       // Validate the key count in the Bloom filter.
315       boolean newKey = true;
316       if (prev != null) {
317         newKey = !(bt == BloomType.ROW ? KeyValue.COMPARATOR.matchingRows(kv,
318             prev) : KeyValue.COMPARATOR.matchingRowColumn(kv, prev));
319       }
320       if (newKey)
321         ++keyCount;
322       assertEquals(keyCount, cbbf.getKeyCount());
323 
324       prev = kv;
325     }
326     w.close();
327 
328     return w.getPath();
329   }
330 
331   @Test
332   public void testCompoundBloomSizing() {
333     int bloomBlockByteSize = 4096;
334     int bloomBlockBitSize = bloomBlockByteSize * 8;
335     double targetErrorRate = 0.01;
336     long maxKeysPerChunk = ByteBloomFilter.idealMaxKeys(bloomBlockBitSize,
337         targetErrorRate);
338 
339     long bloomSize1 = bloomBlockByteSize * 8;
340     long bloomSize2 = ByteBloomFilter.computeBitSize(maxKeysPerChunk,
341         targetErrorRate);
342 
343     double bloomSizeRatio = (bloomSize2 * 1.0 / bloomSize1);
344     assertTrue(Math.abs(bloomSizeRatio - 0.9999) < 0.0001);
345   }
346 
347   @Test
348   public void testCreateKey() {
349     CompoundBloomFilterBase cbfb = new CompoundBloomFilterBase();
350     byte[] row = "myRow".getBytes();
351     byte[] qualifier = "myQualifier".getBytes();
352     byte[] rowKey = cbfb.createBloomKey(row, 0, row.length,
353         row, 0, 0);
354     byte[] rowColKey = cbfb.createBloomKey(row, 0, row.length,
355         qualifier, 0, qualifier.length);
356     KeyValue rowKV = KeyValue.createKeyValueFromKey(rowKey);
357     KeyValue rowColKV = KeyValue.createKeyValueFromKey(rowColKey);
358     assertEquals(rowKV.getTimestamp(), rowColKV.getTimestamp());
359     assertEquals(Bytes.toStringBinary(rowKV.getRow()),
360         Bytes.toStringBinary(rowColKV.getRow()));
361     assertEquals(0, rowKV.getQualifier().length);
362   }
363 
364 
365 }
366