/*
 * Copyright 2010 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.regionserver;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValueTestUtil;
import org.apache.hadoop.hbase.MediumTests;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.regionserver.metrics.SchemaMetrics;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.Before;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;

/**
 * Tests optimized scanning of multiple columns.
 */
@RunWith(Parameterized.class)
@Category(MediumTests.class)
public class TestMultiColumnScanner {

  private static final Log LOG = LogFactory.getLog(TestMultiColumnScanner.class);

  private static final String TABLE_NAME =
      TestMultiColumnScanner.class.getSimpleName();

  static final int MAX_VERSIONS = 50;

  private static final String FAMILY = "CF";
  private static final byte[] FAMILY_BYTES = Bytes.toBytes(FAMILY);

  /**
   * The size of the column qualifier set used. Increasing this parameter
   * exponentially increases test time.
   */
  private static final int NUM_COLUMNS = 8;

  private static final int MAX_COLUMN_BIT_MASK = 1 << (NUM_COLUMNS - 1);
  private static final int NUM_FLUSHES = 10;
  private static final int NUM_ROWS = 20;

  /** A large value of type long for use as a timestamp */
  private static final long BIG_LONG = 9111222333444555666L;

  /**
   * Timestamps to test with. Cannot use {@link Long#MAX_VALUE} here, because
   * it will be replaced by a timestamp auto-generated from the current time.
   */
  private static final long[] TIMESTAMPS = new long[] { 1, 3, 5,
      Integer.MAX_VALUE, BIG_LONG, Long.MAX_VALUE - 1 };

  /** The probability that a column is skipped in a store file. */
  private static final double COLUMN_SKIP_IN_STORE_FILE_PROB = 0.7;

  /** The probability of skipping a column in a single row. */
  private static final double COLUMN_SKIP_IN_ROW_PROB = 0.1;

  /** The probability of skipping a column everywhere. */
  private static final double COLUMN_SKIP_EVERYWHERE_PROB = 0.1;

  /** The probability of deleting a row/column pair. */
  private static final double DELETE_PROBABILITY = 0.02;

  private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();

  private final Compression.Algorithm comprAlgo;
  private final StoreFile.BloomType bloomType;
  private final DataBlockEncoding dataBlockEncoding;

  // Some static sanity-checking.
  static {
    assertTrue(BIG_LONG > 0.9 * Long.MAX_VALUE); // Guard against typos.

    // Ensure TIMESTAMPS are sorted.
    for (int i = 0; i < TIMESTAMPS.length - 1; ++i)
      assertTrue(TIMESTAMPS[i] < TIMESTAMPS[i + 1]);
  }

  @Before
  public void setUp() {
    SchemaMetrics.configureGlobally(TEST_UTIL.getConfiguration());
  }

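  /**
   * Test parameters: every bloom filter / compression combination from
   * {@link HBaseTestingUtility#BLOOM_AND_COMPRESSION_COMBINATIONS}, each run
   * with data block encoding disabled and enabled (PREFIX).
   */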
  @Parameters
  public static final Collection<Object[]> parameters() {
    List<Object[]> parameters = new ArrayList<Object[]>();
    for (Object[] bloomAndCompressionParams :
        HBaseTestingUtility.BLOOM_AND_COMPRESSION_COMBINATIONS) {
      for (boolean useDataBlockEncoding : new boolean[]{false, true}) {
        parameters.add(ArrayUtils.add(bloomAndCompressionParams,
            useDataBlockEncoding));
      }
    }
    return parameters;
  }

  public TestMultiColumnScanner(Compression.Algorithm comprAlgo,
      StoreFile.BloomType bloomType, boolean useDataBlockEncoding) {
    this.comprAlgo = comprAlgo;
    this.bloomType = bloomType;
    this.dataBlockEncoding = useDataBlockEncoding ? DataBlockEncoding.PREFIX :
        DataBlockEncoding.NONE;
  }

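  /**
   * Loads the region with puts and deletes spread over several flushes, so
   * that multiple store files containing different subsets of the columns are
   * created, then scans with every combination of requested columns and
   * maxVersions and checks the results against the expected key/value list.
   */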
  @Test
  public void testMultiColumnScanner() throws IOException {
    HRegion region = TEST_UTIL.createTestRegion(TABLE_NAME,
        new HColumnDescriptor(FAMILY)
            .setCompressionType(comprAlgo)
            .setBloomFilterType(bloomType)
            .setMaxVersions(MAX_VERSIONS)
            .setDataBlockEncoding(dataBlockEncoding)
    );
    List<String> rows = sequentialStrings("row", NUM_ROWS);
    List<String> qualifiers = sequentialStrings("qual", NUM_COLUMNS);
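    // All distinct KeyValues written by the test; keySet is used to avoid
    // adding the same key twice across flushes. This list is the reference
    // against which scan results are verified.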
    List<KeyValue> kvs = new ArrayList<KeyValue>();
    Set<String> keySet = new HashSet<String>();

    // A map from <row>_<qualifier> to the most recent delete timestamp for
    // that column.
    Map<String, Long> lastDelTimeMap = new HashMap<String, Long>();

    Random rand = new Random(29372937L);
    Set<String> rowQualSkip = new HashSet<String>();

    // Skip some columns in some rows. We need to test scanning over a set
    // of columns when some of the columns are not there.
    for (String row : rows)
      for (String qual : qualifiers)
        if (rand.nextDouble() < COLUMN_SKIP_IN_ROW_PROB) {
          LOG.info("Skipping " + qual + " in row " + row);
          rowQualSkip.add(rowQualKey(row, qual));
        }

    // Also skip some columns in all rows.
    for (String qual : qualifiers)
      if (rand.nextDouble() < COLUMN_SKIP_EVERYWHERE_PROB) {
        LOG.info("Skipping " + qual + " in all rows");
        for (String row : rows)
          rowQualSkip.add(rowQualKey(row, qual));
      }

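    // Create NUM_FLUSHES store files, each containing a random subset of the
    // columns, so that scanning has to merge files with different column sets.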
    for (int iFlush = 0; iFlush < NUM_FLUSHES; ++iFlush) {
      for (String qual : qualifiers) {
        // This is where we decide whether to include this column in the
        // current store file, regardless of row and timestamp.
        if (rand.nextDouble() < COLUMN_SKIP_IN_STORE_FILE_PROB)
          continue;

        byte[] qualBytes = Bytes.toBytes(qual);
        for (String row : rows) {
          Put p = new Put(Bytes.toBytes(row));
          for (long ts : TIMESTAMPS) {
            String value = createValue(row, qual, ts);
            KeyValue kv = KeyValueTestUtil.create(row, FAMILY, qual, ts,
                value);
            assertEquals(ts, kv.getTimestamp());
            p.add(kv);
            String keyAsString = kv.toString();
            if (!keySet.contains(keyAsString)) {
              keySet.add(keyAsString);
              kvs.add(kv);
            }
          }
          region.put(p);

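          // Randomly delete some of the versions just written and remember
          // the latest delete timestamp for each row/column pair.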
          Delete d = new Delete(Bytes.toBytes(row));
          boolean deletedSomething = false;
          for (long ts : TIMESTAMPS)
            if (rand.nextDouble() < DELETE_PROBABILITY) {
              d.deleteColumns(FAMILY_BYTES, qualBytes, ts);
              String rowAndQual = row + "_" + qual;
              Long whenDeleted = lastDelTimeMap.get(rowAndQual);
              lastDelTimeMap.put(rowAndQual, whenDeleted == null ? ts
                  : Math.max(ts, whenDeleted));
              deletedSomething = true;
            }
          if (deletedSomething)
            region.delete(d, null, true);
        }
      }
      region.flushcache();
    }

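    // Verification phase: sort the expected key/values and, for every
    // (maxVersions, column subset) combination, scan the region and compare
    // the scanner output with what matchesQuery() predicts.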
    Collections.sort(kvs, KeyValue.COMPARATOR);
    for (int maxVersions = 1; maxVersions <= TIMESTAMPS.length; ++maxVersions) {
      for (int columnBitMask = 1; columnBitMask <= MAX_COLUMN_BIT_MASK; ++columnBitMask) {
        Scan scan = new Scan();
        scan.setMaxVersions(maxVersions);
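        // Decode columnBitMask: bit i selects qualifiers.get(i) for this scan.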
        Set<String> qualSet = new TreeSet<String>();
        {
          int columnMaskTmp = columnBitMask;
          for (String qual : qualifiers) {
            if ((columnMaskTmp & 1) != 0) {
              scan.addColumn(FAMILY_BYTES, Bytes.toBytes(qual));
              qualSet.add(qual);
            }
            columnMaskTmp >>= 1;
          }
          assertEquals(0, columnMaskTmp);
        }

        InternalScanner scanner = region.getScanner(scan);
        List<KeyValue> results = new ArrayList<KeyValue>();

        int kvPos = 0;
        int numResults = 0;
        String queryInfo = "columns queried: " + qualSet + " (columnBitMask="
            + columnBitMask + "), maxVersions=" + maxVersions;

        while (scanner.next(results) || results.size() > 0) {
          for (KeyValue kv : results) {
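            // Skip over expected key/values that this particular query
            // (column set, maxVersions, deletes) should not return.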
            while (kvPos < kvs.size()
                && !matchesQuery(kvs.get(kvPos), qualSet, maxVersions,
                    lastDelTimeMap)) {
              ++kvPos;
            }
            String rowQual = getRowQualStr(kv);
            String deleteInfo = "";
            Long lastDelTS = lastDelTimeMap.get(rowQual);
            if (lastDelTS != null) {
              deleteInfo = "; last timestamp when row/column " + rowQual
                  + " was deleted: " + lastDelTS;
            }
            assertTrue("Scanner returned additional key/value: " + kv + ", "
                + queryInfo + deleteInfo + ";", kvPos < kvs.size());
            assertEquals("Scanner returned wrong key/value; " + queryInfo
                + deleteInfo + ";", kvs.get(kvPos), kv);
            ++kvPos;
            ++numResults;
          }
          results.clear();
        }
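        // Anything left in kvs that still matches the query was missed by
        // the scanner and is a failure.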
        for (; kvPos < kvs.size(); ++kvPos) {
          KeyValue remainingKV = kvs.get(kvPos);
          assertFalse("Matching column not returned by scanner: "
              + remainingKV + ", " + queryInfo + ", results returned: "
              + numResults, matchesQuery(remainingKV, qualSet, maxVersions,
              lastDelTimeMap));
        }
      }
    }
    assertTrue("This test is supposed to delete at least some row/column " +
        "pairs", lastDelTimeMap.size() > 0);
    LOG.info("Number of row/col pairs deleted at least once: " +
        lastDelTimeMap.size());
    region.close();
    region.getLog().closeAndDelete();
  }

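  /** Returns the "row_qualifier" key used to look up delete timestamps. */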
  private static String getRowQualStr(KeyValue kv) {
    String rowStr = Bytes.toString(kv.getBuffer(), kv.getRowOffset(),
        kv.getRowLength());
    String qualStr = Bytes.toString(kv.getBuffer(), kv.getQualifierOffset(),
        kv.getQualifierLength());
    return rowStr + "_" + qualStr;
  }

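  /**
   * Whether the given key/value is expected in the results of a scan over the
   * given column set with the given maxVersions. Because every column has at
   * most one version per timestamp in TIMESTAMPS, requesting maxVersions
   * versions translates into a minimum timestamp. A key/value is excluded if
   * it was written at or before the latest delete of its row/column pair.
   */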
  private static boolean matchesQuery(KeyValue kv, Set<String> qualSet,
      int maxVersions, Map<String, Long> lastDelTimeMap) {
    Long lastDelTS = lastDelTimeMap.get(getRowQualStr(kv));
    long ts = kv.getTimestamp();
    return qualSet.contains(qualStr(kv))
        && ts >= TIMESTAMPS[TIMESTAMPS.length - maxVersions]
        && (lastDelTS == null || ts > lastDelTS);
  }

  private static String qualStr(KeyValue kv) {
    return Bytes.toString(kv.getBuffer(), kv.getQualifierOffset(),
        kv.getQualifierLength());
  }

  private static String rowQualKey(String row, String qual) {
    return row + "_" + qual;
  }

  static String createValue(String row, String qual, long ts) {
    return "value_for_" + row + "_" + qual + "_" + ts;
  }

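  /**
   * Generates n distinct strings starting with the given prefix, with a
   * bit-pattern suffix so that the string lengths vary.
   */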
  private static List<String> sequentialStrings(String prefix, int n) {
    List<String> lst = new ArrayList<String>();
    for (int i = 0; i < n; ++i) {
      StringBuilder sb = new StringBuilder();
      sb.append(prefix + i);

      // Make the string length depend on i so row/column names vary in length.
      int iBitShifted = i;
      while (iBitShifted != 0) {
        sb.append((iBitShifted & 1) == 0 ? 'a' : 'b');
        iBitShifted >>= 1;
      }

      lst.add(sb.toString());
    }

    return lst;
  }

  @org.junit.Rule
  public org.apache.hadoop.hbase.ResourceCheckerJUnitRule cu =
    new org.apache.hadoop.hbase.ResourceCheckerJUnitRule();
}