1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements. See the NOTICE file distributed with this
4    * work for additional information regarding copyright ownership. The ASF
5    * licenses this file to you under the Apache License, Version 2.0 (the
6    * "License"); you may not use this file except in compliance with the License.
7    * You may obtain a copy of the License at
8    *
9    * http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14   * License for the specific language governing permissions and limitations
15   * under the License.
16   */
17  package org.apache.hadoop.hbase.io.encoding;
18  
19  import java.nio.ByteBuffer;
20  import java.util.ArrayList;
21  import java.util.Collections;
22  import java.util.HashMap;
23  import java.util.List;
24  import java.util.Map;
25  import java.util.Random;
26  
27  import org.apache.hadoop.hbase.KeyValue;
28  import org.apache.hadoop.hbase.util.ByteBufferUtils;
29  import org.apache.hadoop.io.WritableUtils;
30  
31  /**
32   * Generate list of key values which are very useful to test data block encoding
33   * and compression.
34   */
35  public class RedundantKVGenerator {
36    // row settings
37    static int DEFAULT_NUMBER_OF_ROW_PREFIXES = 10;
38    static int DEFAULT_AVERAGE_PREFIX_LENGTH = 6;
39    static int DEFAULT_PREFIX_LENGTH_VARIANCE = 3;
40    static int DEFAULT_AVERAGE_SUFFIX_LENGTH = 3;
41    static int DEFAULT_SUFFIX_LENGTH_VARIANCE = 3;
42    static int DEFAULT_NUMBER_OF_ROW = 500;
43  
44    // qualifier
45    static float DEFAULT_CHANCE_FOR_SAME_QUALIFIER = 0.5f;
46    static float DEFAULT_CHANCE_FOR_SIMILIAR_QUALIFIER = 0.4f;
47    static int DEFAULT_AVERAGE_QUALIFIER_LENGTH = 9;
48    static int DEFAULT_QUALIFIER_LENGTH_VARIANCE = 3;
49  
50    static int DEFAULT_COLUMN_FAMILY_LENGTH = 9;
51    static int DEFAULT_VALUE_LENGTH = 8;
52    static float DEFAULT_CHANCE_FOR_ZERO_VALUE = 0.5f;
53  
54    static int DEFAULT_BASE_TIMESTAMP_DIVIDE = 1000000;
55    static int DEFAULT_TIMESTAMP_DIFF_SIZE = 100000000;
56  
57    /**
58     * Default constructor, assumes all parameters from class constants.
59     */
60    public RedundantKVGenerator() {
61      this(new Random(42L),
62          DEFAULT_NUMBER_OF_ROW_PREFIXES,
63          DEFAULT_AVERAGE_PREFIX_LENGTH,
64          DEFAULT_PREFIX_LENGTH_VARIANCE,
65          DEFAULT_AVERAGE_SUFFIX_LENGTH,
66          DEFAULT_SUFFIX_LENGTH_VARIANCE,
67          DEFAULT_NUMBER_OF_ROW,
68  
69          DEFAULT_CHANCE_FOR_SAME_QUALIFIER,
70          DEFAULT_CHANCE_FOR_SIMILIAR_QUALIFIER,
71          DEFAULT_AVERAGE_QUALIFIER_LENGTH,
72          DEFAULT_QUALIFIER_LENGTH_VARIANCE,
73  
74          DEFAULT_COLUMN_FAMILY_LENGTH,
75          DEFAULT_VALUE_LENGTH,
76          DEFAULT_CHANCE_FOR_ZERO_VALUE,
77  
78          DEFAULT_BASE_TIMESTAMP_DIVIDE,
79          DEFAULT_TIMESTAMP_DIFF_SIZE
80      );
81    }
82  
83  
84    /**
85     * Various configuration options for generating key values
86     * @param randomizer pick things by random
87     */
88    public RedundantKVGenerator(Random randomizer,
89        int numberOfRowPrefixes,
90        int averagePrefixLength,
91        int prefixLengthVariance,
92        int averageSuffixLength,
93        int suffixLengthVariance,
94        int numberOfRows,
95  
96        float chanceForSameQualifier,
97        float chanceForSimiliarQualifier,
98        int averageQualifierLength,
99        int qualifierLengthVariance,
100 
101       int columnFamilyLength,
102       int valueLength,
103       float chanceForZeroValue,
104 
105       int baseTimestampDivide,
106       int timestampDiffSize
107       ) {
108     this.randomizer = randomizer;
109 
110     this.numberOfRowPrefixes = numberOfRowPrefixes;
111     this.averagePrefixLength = averagePrefixLength;
112     this.prefixLengthVariance = prefixLengthVariance;
113     this.averageSuffixLength = averageSuffixLength;
114     this.suffixLengthVariance = suffixLengthVariance;
115     this.numberOfRows = numberOfRows;
116 
117     this.chanceForSameQualifier = chanceForSameQualifier;
118     this.chanceForSimiliarQualifier = chanceForSimiliarQualifier;
119     this.averageQualifierLength = averageQualifierLength;
120     this.qualifierLengthVariance = qualifierLengthVariance;
121 
122     this.columnFamilyLength = columnFamilyLength;
123     this.valueLength = valueLength;
124     this.chanceForZeroValue = chanceForZeroValue;
125 
126     this.baseTimestampDivide = baseTimestampDivide;
127     this.timestampDiffSize = timestampDiffSize;
128   }
129 
130   /** Used to generate dataset */
131   private Random randomizer;
132 
133   // row settings
134   private int numberOfRowPrefixes;
135   private int averagePrefixLength = 6;
136   private int prefixLengthVariance = 3;
137   private int averageSuffixLength = 3;
138   private int suffixLengthVariance = 3;
139   private int numberOfRows = 500;
140 
141   // qualifier
142   private float chanceForSameQualifier = 0.5f;
143   private float chanceForSimiliarQualifier = 0.4f;
144   private int averageQualifierLength = 9;
145   private int qualifierLengthVariance = 3;
146 
147   private int columnFamilyLength = 9;
148   private int valueLength = 8;
149   private float chanceForZeroValue = 0.5f;
150 
151   private int baseTimestampDivide = 1000000;
152   private int timestampDiffSize = 100000000;
153 
154   private List<byte[]> generateRows() {
155     // generate prefixes
156     List<byte[]> prefixes = new ArrayList<byte[]>();
157     prefixes.add(new byte[0]);
158     for (int i = 1; i < numberOfRowPrefixes; ++i) {
159       int prefixLength = averagePrefixLength;
160       prefixLength += randomizer.nextInt(2 * prefixLengthVariance + 1) -
161           prefixLengthVariance;
162       byte[] newPrefix = new byte[prefixLength];
163       randomizer.nextBytes(newPrefix);
164       prefixes.add(newPrefix);
165     }
166 
167     // generate rest of the row
168     List<byte[]> rows = new ArrayList<byte[]>();
169     for (int i = 0; i < numberOfRows; ++i) {
170       int suffixLength = averageSuffixLength;
171       suffixLength += randomizer.nextInt(2 * suffixLengthVariance + 1) -
172           suffixLengthVariance;
173       int randomPrefix = randomizer.nextInt(prefixes.size());
174       byte[] row = new byte[prefixes.get(randomPrefix).length +
175                             suffixLength];
176       rows.add(row);
177     }
178 
179     return rows;
180   }
181 
182   /**
183    * Generate test data useful to test encoders.
184    * @param howMany How many Key values should be generated.
185    * @return sorted list of key values
186    */
187   public List<KeyValue> generateTestKeyValues(int howMany) {
188     List<KeyValue> result = new ArrayList<KeyValue>();
189 
190     List<byte[]> rows = generateRows();
191     Map<Integer, List<byte[]>> rowsToQualifier =
192         new HashMap<Integer, List<byte[]>>();
193 
194     byte[] family = new byte[columnFamilyLength];
195     randomizer.nextBytes(family);
196 
197     long baseTimestamp = Math.abs(randomizer.nextLong()) /
198         baseTimestampDivide;
199 
200     byte[] value = new byte[valueLength];
201 
202     for (int i = 0; i < howMany; ++i) {
203       long timestamp = baseTimestamp + randomizer.nextInt(
204           timestampDiffSize);
205       Integer rowId = randomizer.nextInt(rows.size());
206       byte[] row = rows.get(rowId);
207 
208       // generate qualifier, sometimes it is same, sometimes similar,
209       // occasionally completely different
210       byte[] qualifier;
211       float qualifierChance = randomizer.nextFloat();
212       if (!rowsToQualifier.containsKey(rowId) ||
213           qualifierChance > chanceForSameQualifier +
214           chanceForSimiliarQualifier) {
215         int qualifierLength = averageQualifierLength;
216         qualifierLength +=
217             randomizer.nextInt(2 * qualifierLengthVariance + 1) -
218             qualifierLengthVariance;
219         qualifier = new byte[qualifierLength];
220         randomizer.nextBytes(qualifier);
221 
222         // add it to map
223         if (!rowsToQualifier.containsKey(rowId)) {
224           rowsToQualifier.put(rowId, new ArrayList<byte[]>());
225         }
226         rowsToQualifier.get(rowId).add(qualifier);
227       } else if (qualifierChance > chanceForSameQualifier) {
228         // similar qualifier
229         List<byte[]> previousQualifiers = rowsToQualifier.get(rowId);
230         byte[] originalQualifier = previousQualifiers.get(
231             randomizer.nextInt(previousQualifiers.size()));
232 
233         qualifier = new byte[originalQualifier.length];
234         int commonPrefix = randomizer.nextInt(qualifier.length);
235         System.arraycopy(originalQualifier, 0, qualifier, 0, commonPrefix);
236         for (int j = commonPrefix; j < qualifier.length; ++j) {
237           qualifier[j] = (byte) (randomizer.nextInt() & 0xff);
238         }
239 
240         rowsToQualifier.get(rowId).add(qualifier);
241       } else {
242         // same qualifier
243         List<byte[]> previousQualifiers = rowsToQualifier.get(rowId);
244         qualifier = previousQualifiers.get(
245             randomizer.nextInt(previousQualifiers.size()));
246       }
247 
248       if (randomizer.nextFloat() < chanceForZeroValue) {
249         for (int j = 0; j < value.length; ++j) {
250           value[j] = (byte) 0;
251         }
252       } else {
253         randomizer.nextBytes(value);
254       }
255 
256       result.add(new KeyValue(row, family, qualifier, timestamp, value));
257     }
258 
259     Collections.sort(result, KeyValue.COMPARATOR);
260 
261     return result;
262   }
263 
264   /**
265    * Convert list of KeyValues to byte buffer.
266    * @param keyValues list of KeyValues to be converted.
267    * @return buffer with content from key values
268    */
269   public static ByteBuffer convertKvToByteBuffer(List<KeyValue> keyValues,
270       boolean includesMemstoreTS) {
271     int totalSize = 0;
272     for (KeyValue kv : keyValues) {
273       totalSize += kv.getLength();
274       if (includesMemstoreTS) {
275         totalSize += WritableUtils.getVIntSize(kv.getMemstoreTS());
276       }
277     }
278 
279     ByteBuffer result = ByteBuffer.allocate(totalSize);
280     for (KeyValue kv : keyValues) {
281       result.put(kv.getBuffer(), kv.getOffset(), kv.getLength());
282       if (includesMemstoreTS) {
283         ByteBufferUtils.writeVLong(result, kv.getMemstoreTS());
284       }
285     }
286 
287     return result;
288   }
289 
290 }