View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.filter;
20  
21  import com.google.protobuf.InvalidProtocolBufferException;
22  
23  import java.nio.charset.Charset;
24  import java.nio.charset.IllegalCharsetNameException;
25  import java.util.Arrays;
26  import java.util.regex.Pattern;
27  
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  import org.apache.hadoop.hbase.classification.InterfaceAudience;
31  import org.apache.hadoop.hbase.classification.InterfaceStability;
32  import org.apache.hadoop.hbase.exceptions.DeserializationException;
33  import org.apache.hadoop.hbase.protobuf.generated.ComparatorProtos;
34  import org.apache.hadoop.hbase.util.Bytes;
35  
36  import org.jcodings.Encoding;
37  import org.jcodings.EncodingDB;
38  import org.jcodings.specific.UTF8Encoding;
39  import org.joni.Matcher;
40  import org.joni.Option;
41  import org.joni.Regex;
42  import org.joni.Syntax;
43  
44  /**
45   * This comparator is for use with {@link CompareFilter} implementations, such
46   * as {@link RowFilter}, {@link QualifierFilter}, and {@link ValueFilter}, for
47   * filtering based on the value of a given column. Use it to test if a given
48   * regular expression matches a cell value in the column.
49   * <p>
50   * Only EQUAL or NOT_EQUAL comparisons are valid with this comparator.
51   * <p>
52   * For example:
53   * <p>
54   * <pre>
55   * ValueFilter vf = new ValueFilter(CompareOp.EQUAL,
56   *     new RegexStringComparator(
57   *       // v4 IP address
58   *       "(((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3,3}" +
59   *         "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(\\/[0-9]+)?" +
60   *         "|" +
61   *       // v6 IP address
62   *       "((([\\dA-Fa-f]{1,4}:){7}[\\dA-Fa-f]{1,4})(:([\\d]{1,3}.)" +
63   *         "{3}[\\d]{1,3})?)(\\/[0-9]+)?"));
64   * </pre>
65   * <p>
66   * Supports {@link java.util.regex.Pattern} flags as well:
67   * <p>
68   * <pre>
69   * ValueFilter vf = new ValueFilter(CompareOp.EQUAL,
70   *     new RegexStringComparator("regex", Pattern.CASE_INSENSITIVE | Pattern.DOTALL));
71   * </pre>
72   * @see java.util.regex.Pattern
73   */
74  @InterfaceAudience.Public
75  @InterfaceStability.Stable
76  public class RegexStringComparator extends ByteArrayComparable {
77  
78    private static final Log LOG = LogFactory.getLog(RegexStringComparator.class);
79  
80    private Engine engine;
81  
82    /** Engine implementation type (default=JAVA) */
83    @InterfaceAudience.Public
84    @InterfaceStability.Evolving
85    public enum EngineType {
86      JAVA,
87      JONI
88    }
89  
90    /**
91     * Constructor
92     * Adds Pattern.DOTALL to the underlying Pattern
93     * @param expr a valid regular expression
94     */
95    public RegexStringComparator(String expr) {
96      this(expr, Pattern.DOTALL);
97    }
98  
99    /**
100    * Constructor
101    * Adds Pattern.DOTALL to the underlying Pattern
102    * @param expr a valid regular expression
103    * @param engine engine implementation type
104    */
105   public RegexStringComparator(String expr, EngineType engine) {
106     this(expr, Pattern.DOTALL, engine);
107   }
108 
109   /**
110    * Constructor
111    * @param expr a valid regular expression
112    * @param flags java.util.regex.Pattern flags
113    */
114   public RegexStringComparator(String expr, int flags) {
115     this(expr, flags, EngineType.JAVA);
116   }
117 
118   /**
119    * Constructor
120    * @param expr a valid regular expression
121    * @param flags java.util.regex.Pattern flags
122    * @param engine engine implementation type
123    */
124   public RegexStringComparator(String expr, int flags, EngineType engine) {
125     super(Bytes.toBytes(expr));
126     switch (engine) {
127       case JAVA:
128         this.engine = new JavaRegexEngine(expr, flags);
129         break;
130       case JONI:
131         this.engine = new JoniRegexEngine(expr, flags);
132         break;
133     }
134   }
135 
136   /**
137    * Specifies the {@link Charset} to use to convert the row key to a String.
138    * <p>
139    * The row key needs to be converted to a String in order to be matched
140    * against the regular expression.  This method controls which charset is
141    * used to do this conversion.
142    * <p>
143    * If the row key is made of arbitrary bytes, the charset {@code ISO-8859-1}
144    * is recommended.
145    * @param charset The charset to use.
146    */
147   public void setCharset(final Charset charset) {
148     engine.setCharset(charset.name());
149   }
150 
151   @Override
152   public int compareTo(byte[] value, int offset, int length) {
153     return engine.compareTo(value, offset, length);
154   }
155 
156   /**
157    * @return The comparator serialized using pb
158    */
159   public byte [] toByteArray() {
160     return engine.toByteArray();
161   }
162 
163   /**
164    * @param pbBytes A pb serialized {@link RegexStringComparator} instance
165    * @return An instance of {@link RegexStringComparator} made from <code>bytes</code>
166    * @throws DeserializationException
167    * @see #toByteArray
168    */
169   public static RegexStringComparator parseFrom(final byte [] pbBytes)
170   throws DeserializationException {
171     ComparatorProtos.RegexStringComparator proto;
172     try {
173       proto = ComparatorProtos.RegexStringComparator.parseFrom(pbBytes);
174     } catch (InvalidProtocolBufferException e) {
175       throw new DeserializationException(e);
176     }
177     RegexStringComparator comparator;
178     if (proto.hasEngine()) {
179       EngineType engine = EngineType.valueOf(proto.getEngine());
180       comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags(),
181         engine);      
182     } else {
183       comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags());
184     }
185     String charset = proto.getCharset();
186     if (charset.length() > 0) {
187       try {
188         comparator.getEngine().setCharset(charset);
189       } catch (IllegalCharsetNameException e) {
190         LOG.error("invalid charset", e);
191       }
192     }
193     return comparator;
194   }
195 
196   /**
197    * @param other
198    * @return true if and only if the fields of the comparator that are serialized
199    * are equal to the corresponding fields in other.  Used for testing.
200    */
201   boolean areSerializedFieldsEqual(ByteArrayComparable other) {
202     if (other == this) return true;
203     if (!(other instanceof RegexStringComparator)) return false;
204     RegexStringComparator comparator = (RegexStringComparator)other;
205     return super.areSerializedFieldsEqual(comparator)
206       && engine.getClass().isInstance(comparator.getEngine())
207       && engine.getPattern().equals(comparator.getEngine().getPattern())
208       && engine.getFlags() == comparator.getEngine().getFlags()
209       && engine.getCharset().equals(comparator.getEngine().getCharset());
210   }
211 
212   Engine getEngine() {
213     return engine;
214   }
215 
216   /**
217    * This is an internal interface for abstracting access to different regular
218    * expression matching engines. 
219    */
220   static interface Engine {
221     /**
222      * Returns the string representation of the configured regular expression
223      * for matching
224      */
225     String getPattern();
226     
227     /**
228      * Returns the set of configured match flags, a bit mask that may include
229      * {@link Pattern} flags
230      */
231     int getFlags();
232 
233     /**
234      * Returns the name of the configured charset
235      */
236     String getCharset();
237 
238     /**
239      * Set the charset used when matching
240      * @param charset the name of the desired charset for matching
241      */
242     void setCharset(final String charset);
243 
244     /**
245      * Return the serialized form of the configured matcher
246      */
247     byte [] toByteArray();
248 
249     /**
250      * Match the given input against the configured pattern
251      * @param value the data to be matched
252      * @param offset offset of the data to be matched
253      * @param length length of the data to be matched
254      * @return 0 if a match was made, 1 otherwise
255      */
256     int compareTo(byte[] value, int offset, int length);
257   }
258 
259   /**
260    * Implementation of the Engine interface using Java's Pattern.
261    * <p>
262    * This is the default engine.
263    */
264   static class JavaRegexEngine implements Engine {
265     private Charset charset = Charset.forName("UTF-8");
266     private Pattern pattern;
267 
268     public JavaRegexEngine(String regex, int flags) {
269       this.pattern = Pattern.compile(regex, flags);
270     }
271 
272     @Override
273     public String getPattern() {
274       return pattern.toString();
275     }
276 
277     @Override
278     public int getFlags() {
279       return pattern.flags();
280     }
281 
282     @Override
283     public String getCharset() {
284       return charset.name();
285     }
286 
287     @Override
288     public void setCharset(String charset) {
289       this.charset = Charset.forName(charset);
290     }
291 
292     @Override
293     public int compareTo(byte[] value, int offset, int length) {
294       // Use find() for subsequence match instead of matches() (full sequence
295       // match) to adhere to the principle of least surprise.
296       String tmp;
297       if (length < value.length / 2) {
298         // See HBASE-9428. Make a copy of the relevant part of the byte[],
299         // or the JDK will copy the entire byte[] during String decode
300         tmp = new String(Arrays.copyOfRange(value, offset, offset + length), charset);
301       } else {
302         tmp = new String(value, offset, length, charset);
303       }
304       return pattern.matcher(tmp).find() ? 0 : 1;
305     }
306 
307     @Override
308     public byte[] toByteArray() {
309       ComparatorProtos.RegexStringComparator.Builder builder =
310           ComparatorProtos.RegexStringComparator.newBuilder();
311       builder.setPattern(pattern.pattern());
312       builder.setPatternFlags(pattern.flags());
313       builder.setCharset(charset.name());
314       builder.setEngine(EngineType.JAVA.name());
315       return builder.build().toByteArray();
316     }
317   }
318 
319   /**
320    * Implementation of the Engine interface using Jruby's joni regex engine.
321    * <p>
322    * This engine operates on byte arrays directly so is expected to be more GC
323    * friendly, and reportedly is twice as fast as Java's Pattern engine.
324    * <p>
325    * NOTE: Only the {@link Pattern} flags CASE_INSENSITIVE, DOTALL, and
326    * MULTILINE are supported.
327    */
328   static class JoniRegexEngine implements Engine {
329     private Encoding encoding = UTF8Encoding.INSTANCE;
330     private String regex;
331     private Regex pattern;
332 
333     public JoniRegexEngine(String regex, int flags) {
334       this.regex = regex;
335       byte[] b = Bytes.toBytes(regex);
336       this.pattern = new Regex(b, 0, b.length, patternToJoniFlags(flags), encoding, Syntax.Java);
337     }
338 
339     @Override
340     public String getPattern() {
341       return regex;
342     }
343 
344     @Override
345     public int getFlags() {
346       return pattern.getOptions();
347     }
348 
349     @Override
350     public String getCharset() {
351       return encoding.getCharsetName();
352     }
353 
354     @Override
355     public void setCharset(String name) {
356       setEncoding(name);
357     }
358 
359     @Override
360     public int compareTo(byte[] value, int offset, int length) {
361       // Use subsequence match instead of full sequence match to adhere to the
362       // principle of least surprise.
363       Matcher m = pattern.matcher(value);
364       return m.search(offset, length, pattern.getOptions()) < 0 ? 1 : 0;
365     }
366 
367     @Override
368     public byte[] toByteArray() {
369       ComparatorProtos.RegexStringComparator.Builder builder =
370           ComparatorProtos.RegexStringComparator.newBuilder();
371         builder.setPattern(regex);
372         builder.setPatternFlags(joniToPatternFlags(pattern.getOptions()));
373         builder.setCharset(encoding.getCharsetName());
374         builder.setEngine(EngineType.JONI.name());
375         return builder.build().toByteArray();
376     }
377 
378     private int patternToJoniFlags(int flags) {
379       int newFlags = 0;
380       if ((flags & Pattern.CASE_INSENSITIVE) != 0) {
381         newFlags |= Option.IGNORECASE;
382       }
383       if ((flags & Pattern.DOTALL) != 0) {
384         // This does NOT mean Pattern.MULTILINE
385         newFlags |= Option.MULTILINE;
386       }
387       if ((flags & Pattern.MULTILINE) != 0) {
388         // This is what Java 8's Nashorn engine does when using joni and
389         // translating Pattern's MULTILINE flag
390         newFlags &= ~Option.SINGLELINE;
391         newFlags |= Option.NEGATE_SINGLELINE;
392       }
393       return newFlags;
394     }
395 
396     private int joniToPatternFlags(int flags) {
397       int newFlags = 0;
398       if ((flags & Option.IGNORECASE) != 0) {
399         newFlags |= Pattern.CASE_INSENSITIVE;
400       }
401       // This does NOT mean Pattern.MULTILINE, this is equivalent to Pattern.DOTALL
402       if ((flags & Option.MULTILINE) != 0) {
403         newFlags |= Pattern.DOTALL;
404       }
405       // This means Pattern.MULTILINE. Nice
406       if ((flags & Option.NEGATE_SINGLELINE) != 0) {
407         newFlags |= Pattern.MULTILINE;
408       }
409       return newFlags;
410     }
411 
412     private void setEncoding(String name) {
413       EncodingDB.Entry e = EncodingDB.getEncodings().get(Bytes.toBytes(name));
414       if (e != null) {
415         encoding = e.getEncoding();
416       } else {
417         throw new IllegalCharsetNameException(name);
418       }    
419     }
420   }
421 }