View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.filter;
19  
20  import java.util.ArrayList;
21  import java.util.Arrays;
22  import java.util.List;
23  
24  import com.google.common.annotations.VisibleForTesting;
25  import com.google.protobuf.InvalidProtocolBufferException;
26  import org.apache.hadoop.hbase.classification.InterfaceAudience;
27  import org.apache.hadoop.hbase.classification.InterfaceStability;
28  import org.apache.hadoop.hbase.Cell;
29  import org.apache.hadoop.hbase.KeyValueUtil;
30  import org.apache.hadoop.hbase.exceptions.DeserializationException;
31  import org.apache.hadoop.hbase.protobuf.generated.FilterProtos;
32  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.BytesBytesPair;
33  import org.apache.hadoop.hbase.util.ByteStringer;
34  import org.apache.hadoop.hbase.util.Bytes;
35  import org.apache.hadoop.hbase.util.Pair;
36  
37  import java.util.ArrayList;
38  import java.util.Arrays;
39  import java.util.List;
40  
41  /**
42   * Filters data based on fuzzy row key. Performs fast-forwards during scanning.
43   * It takes pairs (row key, fuzzy info) to match row keys. Where fuzzy info is
44   * a byte array with 0 or 1 as its values:
45   * <ul>
46   *   <li>
47   *     0 - means that this byte in provided row key is fixed, i.e. row key's byte at same position
48   *         must match
49   *   </li>
50   *   <li>
51   *     1 - means that this byte in provided row key is NOT fixed, i.e. row key's byte at this
52   *         position can be different from the one in provided row key
53   *   </li>
54   * </ul>
55   *
56   *
57   * Example:
58   * Let's assume row key format is userId_actionId_year_month. Length of userId is fixed
59   * and is 4, length of actionId is 2 and year and month are 4 and 2 bytes long respectively.
60   *
61   * Let's assume that we need to fetch all users that performed certain action (encoded as "99")
62   * in Jan of any year. Then the pair (row key, fuzzy info) would be the following:
63   * row key = "????_99_????_01" (one can use any value instead of "?")
64   * fuzzy info = "\x01\x01\x01\x01\x00\x00\x00\x00\x01\x01\x01\x01\x00\x00\x00"
65   *
66   * I.e. fuzzy info tells the matching mask is "????_99_????_01", where at ? can be any value.
67   *
68   */
69  @InterfaceAudience.Public
70  @InterfaceStability.Evolving
71  public class FuzzyRowFilter extends FilterBase {
72    private List<Pair<byte[], byte[]>> fuzzyKeysData;
73    private boolean done = false;
74  
75    public FuzzyRowFilter(List<Pair<byte[], byte[]>> fuzzyKeysData) {
76      Pair<byte[], byte[]> p;
77      for (int i = 0; i < fuzzyKeysData.size(); i++) {
78        p = fuzzyKeysData.get(i);
79        if (p.getFirst().length != p.getSecond().length) {
80          Pair<String, String> readable = new Pair<String, String>(
81            Bytes.toStringBinary(p.getFirst()),
82            Bytes.toStringBinary(p.getSecond()));
83          throw new IllegalArgumentException("Fuzzy pair lengths do not match: " + readable);
84        }
85      }
86      this.fuzzyKeysData = fuzzyKeysData;
87    }
88  
89    // TODO: possible improvement: save which fuzzy row key to use when providing a hint
90    @Override
91    public ReturnCode filterKeyValue(Cell cell) {
92      // assigning "worst" result first and looking for better options
93      SatisfiesCode bestOption = SatisfiesCode.NO_NEXT;
94      for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
95        SatisfiesCode satisfiesCode = satisfies(isReversed(), cell.getRowArray(),
96          cell.getRowOffset(), cell.getRowLength(), fuzzyData.getFirst(), fuzzyData.getSecond());
97        if (satisfiesCode == SatisfiesCode.YES) {
98          return ReturnCode.INCLUDE;
99        }
100 
101       if (satisfiesCode == SatisfiesCode.NEXT_EXISTS) {
102         bestOption = SatisfiesCode.NEXT_EXISTS;
103       }
104     }
105 
106     if (bestOption == SatisfiesCode.NEXT_EXISTS) {
107       return ReturnCode.SEEK_NEXT_USING_HINT;
108     }
109 
110     // the only unhandled SatisfiesCode is NO_NEXT, i.e. we are done
111     done = true;
112     return ReturnCode.NEXT_ROW;
113   }
114 
115   // Override here explicitly as the method in super class FilterBase might do a KeyValue recreate.
116   // See HBASE-12068
117   @Override
118   public Cell transformCell(Cell v) {
119     return v;
120   }
121 
122   @Override
123   public Cell getNextCellHint(Cell curCell) {
124     byte[] nextRowKey = null;
125     // Searching for the "smallest" row key that satisfies at least one fuzzy row key
126     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
127       byte[] nextRowKeyCandidate = getNextForFuzzyRule(isReversed(), curCell.getRowArray(),
128           curCell.getRowOffset(), curCell.getRowLength(), fuzzyData.getFirst(),
129           fuzzyData.getSecond());
130       if (nextRowKeyCandidate == null) {
131         continue;
132       }
133       if (nextRowKey == null ||
134         (reversed && Bytes.compareTo(nextRowKeyCandidate, nextRowKey) > 0) ||
135         (!reversed && Bytes.compareTo(nextRowKeyCandidate, nextRowKey) < 0)) {
136         nextRowKey = nextRowKeyCandidate;
137       }
138     }
139 
140     if (!reversed && nextRowKey == null) {
141       // Should never happen for forward scanners; logic in filterKeyValue should return NO_NEXT.
142       // Can happen in reversed scanner when currentKV is just before the next possible match; in
143       // this case, fall back on scanner simply calling KeyValueHeap.next()
144       // TODO: is there a better way than throw exception? (stop the scanner?)
145       throw new IllegalStateException("No next row key that satisfies fuzzy exists when" +
146                                          " getNextKeyHint() is invoked." +
147                                          " Filter: " + this.toString() +
148                                          " currentKV: " + curCell);
149     }
150 
151     return nextRowKey == null ? null : KeyValueUtil.createFirstOnRow(nextRowKey);
152   }
153 
154   @Override
155   public boolean filterAllRemaining() {
156     return done;
157   }
158 
159   /**
160    * @return The filter serialized using pb
161    */
162   public byte [] toByteArray() {
163     FilterProtos.FuzzyRowFilter.Builder builder =
164       FilterProtos.FuzzyRowFilter.newBuilder();
165     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
166       BytesBytesPair.Builder bbpBuilder = BytesBytesPair.newBuilder();
167       bbpBuilder.setFirst(ByteStringer.wrap(fuzzyData.getFirst()));
168       bbpBuilder.setSecond(ByteStringer.wrap(fuzzyData.getSecond()));
169       builder.addFuzzyKeysData(bbpBuilder);
170     }
171     return builder.build().toByteArray();
172   }
173 
174   /**
175    * @param pbBytes A pb serialized {@link FuzzyRowFilter} instance
176    * @return An instance of {@link FuzzyRowFilter} made from <code>bytes</code>
177    * @throws DeserializationException
178    * @see #toByteArray
179    */
180   public static FuzzyRowFilter parseFrom(final byte [] pbBytes)
181   throws DeserializationException {
182     FilterProtos.FuzzyRowFilter proto;
183     try {
184       proto = FilterProtos.FuzzyRowFilter.parseFrom(pbBytes);
185     } catch (InvalidProtocolBufferException e) {
186       throw new DeserializationException(e);
187     }
188     int count = proto.getFuzzyKeysDataCount();
189     ArrayList<Pair<byte[], byte[]>> fuzzyKeysData= new ArrayList<Pair<byte[], byte[]>>(count);
190     for (int i = 0; i < count; ++i) {
191       BytesBytesPair current = proto.getFuzzyKeysData(i);
192       byte[] keyBytes = current.getFirst().toByteArray();
193       byte[] keyMeta = current.getSecond().toByteArray();
194       fuzzyKeysData.add(new Pair<byte[], byte[]>(keyBytes, keyMeta));
195     }
196     return new FuzzyRowFilter(fuzzyKeysData);
197   }
198 
199   @Override
200   public String toString() {
201     final StringBuilder sb = new StringBuilder();
202     sb.append("FuzzyRowFilter");
203     sb.append("{fuzzyKeysData=");
204     for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
205       sb.append('{').append(Bytes.toStringBinary(fuzzyData.getFirst())).append(":");
206       sb.append(Bytes.toStringBinary(fuzzyData.getSecond())).append('}');
207     }
208     sb.append("}, ");
209     return sb.toString();
210   }
211 
212   // Utility methods
213 
214   static enum SatisfiesCode {
215     /** row satisfies fuzzy rule */
216     YES,
217     /** row doesn't satisfy fuzzy rule, but there's possible greater row that does */
218     NEXT_EXISTS,
219     /** row doesn't satisfy fuzzy rule and there's no greater row that does */
220     NO_NEXT
221   }
222 
223   @VisibleForTesting
224   static SatisfiesCode satisfies(byte[] row, byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
225     return satisfies(false, row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
226   }
227 
228   @VisibleForTesting
229   static SatisfiesCode satisfies(boolean reverse, byte[] row, byte[] fuzzyKeyBytes,
230                                  byte[] fuzzyKeyMeta) {
231     return satisfies(reverse, row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
232   }
233 
234   private static SatisfiesCode satisfies(boolean reverse, byte[] row, int offset, int length,
235                                          byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
236     if (row == null) {
237       // do nothing, let scan to proceed
238       return SatisfiesCode.YES;
239     }
240 
241     Order order = Order.orderFor(reverse);
242     boolean nextRowKeyCandidateExists = false;
243 
244     for (int i = 0; i < fuzzyKeyMeta.length && i < length; i++) {
245       // First, checking if this position is fixed and not equals the given one
246       boolean byteAtPositionFixed = fuzzyKeyMeta[i] == 0;
247       boolean fixedByteIncorrect = byteAtPositionFixed && fuzzyKeyBytes[i] != row[i + offset];
248       if (fixedByteIncorrect) {
249         // in this case there's another row that satisfies fuzzy rule and bigger than this row
250         if (nextRowKeyCandidateExists) {
251           return SatisfiesCode.NEXT_EXISTS;
252         }
253 
254         // If this row byte is less than fixed then there's a byte array bigger than
255         // this row and which satisfies the fuzzy rule. Otherwise there's no such byte array:
256         // this row is simply bigger than any byte array that satisfies the fuzzy rule
257         boolean rowByteLessThanFixed = (row[i + offset] & 0xFF) < (fuzzyKeyBytes[i] & 0xFF);
258         if (rowByteLessThanFixed && !reverse) {
259           return SatisfiesCode.NEXT_EXISTS;
260         } else if (!rowByteLessThanFixed && reverse) {
261           return SatisfiesCode.NEXT_EXISTS;
262         } else {
263           return SatisfiesCode.NO_NEXT;
264         }
265       }
266 
267       // Second, checking if this position is not fixed and byte value is not the biggest. In this
268       // case there's a byte array bigger than this row and which satisfies the fuzzy rule. To get
269       // bigger byte array that satisfies the rule we need to just increase this byte
270       // (see the code of getNextForFuzzyRule below) by one.
271       // Note: if non-fixed byte is already at biggest value, this doesn't allow us to say there's
272       //       bigger one that satisfies the rule as it can't be increased.
273       if (fuzzyKeyMeta[i] == 1 && !order.isMax(fuzzyKeyBytes[i])) {
274         nextRowKeyCandidateExists = true;
275       }
276     }
277 
278     return SatisfiesCode.YES;
279   }
280 
281   @VisibleForTesting
282   static byte[] getNextForFuzzyRule(byte[] row, byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
283     return getNextForFuzzyRule(false, row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
284   }
285 
286   @VisibleForTesting
287   static byte[] getNextForFuzzyRule(boolean reverse, byte[] row, byte[] fuzzyKeyBytes,
288                                     byte[] fuzzyKeyMeta) {
289     return getNextForFuzzyRule(reverse, row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
290   }
291 
292   /** Abstracts directional comparisons based on scan direction. */
293   private enum Order {
294     ASC {
295       public boolean lt(int lhs, int rhs) {
296         return lhs < rhs;
297       }
298       public boolean gt(int lhs, int rhs) {
299         return lhs > rhs;
300       }
301       public byte inc(byte val) {
302         // TODO: what about over/underflow?
303         return (byte) (val + 1);
304       }
305       public boolean isMax(byte val) {
306         return val == (byte) 0xff;
307       }
308       public byte min() {
309         return 0;
310       }
311     },
312     DESC {
313       public boolean lt(int lhs, int rhs) {
314         return lhs > rhs;
315       }
316       public boolean gt(int lhs, int rhs) {
317         return lhs < rhs;
318       }
319       public byte inc(byte val) {
320         // TODO: what about over/underflow?
321         return (byte) (val - 1);
322       }
323       public boolean isMax(byte val) {
324         return val == 0;
325       }
326       public byte min() {
327         return (byte) 0xFF;
328       }
329     };
330 
331     public static Order orderFor(boolean reverse) {
332       return reverse ? DESC : ASC;
333     }
334 
335     /** Returns true when {@code lhs < rhs}. */
336     public abstract boolean lt(int lhs, int rhs);
337     /** Returns true when {@code lhs > rhs}. */
338     public abstract boolean gt(int lhs, int rhs);
339     /** Returns {@code val} incremented by 1. */
340     public abstract byte inc(byte val);
341     /** Return true when {@code val} is the maximum value */
342     public abstract boolean isMax(byte val);
343     /** Return the minimum value according to this ordering scheme. */
344     public abstract byte min();
345   }
346 
347   /**
348    * @return greater byte array than given (row) which satisfies the fuzzy rule if it exists,
349    *         null otherwise
350    */
351   private static byte[] getNextForFuzzyRule(boolean reverse, byte[] row, int offset, int length,
352                                             byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
353     // To find out the next "smallest" byte array that satisfies fuzzy rule and "greater" than
354     // the given one we do the following:
355     // 1. setting values on all "fixed" positions to the values from fuzzyKeyBytes
356     // 2. if during the first step given row did not increase, then we increase the value at
357     //    the first "non-fixed" position (where it is not maximum already)
358 
359     // It is easier to perform this by using fuzzyKeyBytes copy and setting "non-fixed" position
360     // values than otherwise.
361     byte[] result = Arrays.copyOf(fuzzyKeyBytes,
362                                   length > fuzzyKeyBytes.length ? length : fuzzyKeyBytes.length);
363     if (reverse && length > fuzzyKeyBytes.length) {
364       // we need trailing 0xff's instead of trailing 0x00's
365       for (int i = fuzzyKeyBytes.length; i < result.length; i++) {
366         result[i] = (byte) 0xFF;
367       }
368     }
369     int toInc = -1;
370     final Order order = Order.orderFor(reverse);
371 
372     boolean increased = false;
373     for (int i = 0; i < result.length; i++) {
374       if (i >= fuzzyKeyMeta.length || fuzzyKeyMeta[i] == 1) {
375         result[i] = row[offset + i];
376         if (!order.isMax(row[i])) {
377           // this is "non-fixed" position and is not at max value, hence we can increase it
378           toInc = i;
379         }
380       } else if (i < fuzzyKeyMeta.length && fuzzyKeyMeta[i] == 0) {
381         if (order.lt((row[i + offset] & 0xFF), (fuzzyKeyBytes[i] & 0xFF))) {
382           // if setting value for any fixed position increased the original array,
383           // we are OK
384           increased = true;
385           break;
386         }
387 
388         if (order.gt((row[i + offset] & 0xFF), (fuzzyKeyBytes[i] & 0xFF))) {
389           // if setting value for any fixed position makes array "smaller", then just stop:
390           // in case we found some non-fixed position to increase we will do it, otherwise
391           // there's no "next" row key that satisfies fuzzy rule and "greater" than given row
392           break;
393         }
394       }
395     }
396 
397     if (!increased) {
398       if (toInc < 0) {
399         return null;
400       }
401       result[toInc] = order.inc(result[toInc]);
402 
403       // Setting all "non-fixed" positions to zeroes to the right of the one we increased so
404       // that found "next" row key is the smallest possible
405       for (int i = toInc + 1; i < result.length; i++) {
406         if (i >= fuzzyKeyMeta.length || fuzzyKeyMeta[i] == 1) {
407           result[i] = order.min();
408         }
409       }
410     }
411 
412     return result;
413   }
414 
415   /**
416    * @return true if and only if the fields of the filter that are serialized
417    * are equal to the corresponding fields in other.  Used for testing.
418    */
419   boolean areSerializedFieldsEqual(Filter o) {
420     if (o == this) return true;
421     if (!(o instanceof FuzzyRowFilter)) return false;
422 
423     FuzzyRowFilter other = (FuzzyRowFilter)o;
424     if (this.fuzzyKeysData.size() != other.fuzzyKeysData.size()) return false;
425     for (int i = 0; i < fuzzyKeysData.size(); ++i) {
426       Pair<byte[], byte[]> thisData = this.fuzzyKeysData.get(i);
427       Pair<byte[], byte[]> otherData = other.fuzzyKeysData.get(i);
428       if (!(Bytes.equals(thisData.getFirst(), otherData.getFirst())
429         && Bytes.equals(thisData.getSecond(), otherData.getSecond()))) {
430         return false;
431       }
432     }
433     return true;
434   }
435 }