1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.filter;
20
21 import com.google.protobuf.InvalidProtocolBufferException;
22
23 import java.nio.charset.Charset;
24 import java.nio.charset.IllegalCharsetNameException;
25 import java.util.Arrays;
26 import java.util.regex.Pattern;
27
28 import org.apache.commons.logging.Log;
29 import org.apache.commons.logging.LogFactory;
30 import org.apache.hadoop.hbase.classification.InterfaceAudience;
31 import org.apache.hadoop.hbase.classification.InterfaceStability;
32 import org.apache.hadoop.hbase.exceptions.DeserializationException;
33 import org.apache.hadoop.hbase.protobuf.generated.ComparatorProtos;
34 import org.apache.hadoop.hbase.util.Bytes;
35
36 import org.jcodings.Encoding;
37 import org.jcodings.EncodingDB;
38 import org.jcodings.specific.UTF8Encoding;
39 import org.joni.Matcher;
40 import org.joni.Option;
41 import org.joni.Regex;
42 import org.joni.Syntax;
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74 @InterfaceAudience.Public
75 @InterfaceStability.Stable
76 public class RegexStringComparator extends ByteArrayComparable {
77
78 private static final Log LOG = LogFactory.getLog(RegexStringComparator.class);
79
80 private Engine engine;
81
82
83 @InterfaceAudience.Public
84 @InterfaceStability.Evolving
85 public enum EngineType {
86 JAVA,
87 JONI
88 }
89
90
91
92
93
94
/**
 * Creates a comparator for the given expression using the Java engine and the
 * {@link Pattern#DOTALL} flag.
 *
 * @param expr a valid regular expression
 */
public RegexStringComparator(String expr) {
  // Same end result as chaining through the (expr, flags) constructor.
  this(expr, Pattern.DOTALL, EngineType.JAVA);
}
98
99
100
101
102
103
104
/**
 * Creates a comparator for the given expression on the requested engine, with the
 * {@link Pattern#DOTALL} flag set.
 *
 * @param expr a valid regular expression
 * @param engine which engine implementation should evaluate the expression
 */
public RegexStringComparator(final String expr, final EngineType engine) {
  this(expr, Pattern.DOTALL, engine);
}
108
109
110
111
112
113
/**
 * Creates a comparator for the given expression and match flags using the Java engine.
 *
 * @param expr a valid regular expression
 * @param flags {@link java.util.regex.Pattern} flag bit mask
 */
public RegexStringComparator(final String expr, final int flags) {
  this(expr, flags, EngineType.JAVA);
}
117
118
119
120
121
122
123
/**
 * Creates a comparator for the given expression, match flags and engine.
 * The expression's bytes are handed to the superclass; the expression itself is
 * compiled by the selected engine.
 *
 * @param expr a valid regular expression
 * @param flags {@link java.util.regex.Pattern} flag bit mask
 * @param engine which engine implementation should evaluate the expression
 * @throws NullPointerException if {@code engine} is null (raised by the switch)
 * @throws IllegalArgumentException if {@code engine} is a type this class cannot build
 */
public RegexStringComparator(String expr, int flags, EngineType engine) {
  super(Bytes.toBytes(expr));
  switch (engine) {
    case JAVA:
      this.engine = new JavaRegexEngine(expr, flags);
      break;
    case JONI:
      this.engine = new JoniRegexEngine(expr, flags);
      break;
    default:
      // Fail at construction instead of leaving the engine field null, which would
      // only surface later as an unrelated NullPointerException.
      throw new IllegalArgumentException("Unsupported regex engine type: " + engine);
  }
}
135
136
137
138
139
140
141
142
143
144
145
146
147 public void setCharset(final Charset charset) {
148 engine.setCharset(charset.name());
149 }
150
151 @Override
152 public int compareTo(byte[] value, int offset, int length) {
153 return engine.compareTo(value, offset, length);
154 }
155
156
157
158
159 public byte [] toByteArray() {
160 return engine.toByteArray();
161 }
162
163
164
165
166
167
168
169 public static RegexStringComparator parseFrom(final byte [] pbBytes)
170 throws DeserializationException {
171 ComparatorProtos.RegexStringComparator proto;
172 try {
173 proto = ComparatorProtos.RegexStringComparator.parseFrom(pbBytes);
174 } catch (InvalidProtocolBufferException e) {
175 throw new DeserializationException(e);
176 }
177 RegexStringComparator comparator;
178 if (proto.hasEngine()) {
179 EngineType engine = EngineType.valueOf(proto.getEngine());
180 comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags(),
181 engine);
182 } else {
183 comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags());
184 }
185 String charset = proto.getCharset();
186 if (charset.length() > 0) {
187 try {
188 comparator.getEngine().setCharset(charset);
189 } catch (IllegalCharsetNameException e) {
190 LOG.error("invalid charset", e);
191 }
192 }
193 return comparator;
194 }
195
196
197
198
199
200
201 boolean areSerializedFieldsEqual(ByteArrayComparable other) {
202 if (other == this) return true;
203 if (!(other instanceof RegexStringComparator)) return false;
204 RegexStringComparator comparator = (RegexStringComparator)other;
205 return super.areSerializedFieldsEqual(comparator)
206 && engine.getClass().isInstance(comparator.getEngine())
207 && engine.getPattern().equals(comparator.getEngine().getPattern())
208 && engine.getFlags() == comparator.getEngine().getFlags()
209 && engine.getCharset().equals(comparator.getEngine().getCharset());
210 }
211
212 Engine getEngine() {
213 return engine;
214 }
215
216
217
218
219
/**
 * Contract every regular-expression engine used by the comparator must fulfil.
 * (A nested interface is implicitly static.)
 */
interface Engine {

  /**
   * @return the expression string this engine was built with
   */
  String getPattern();

  /**
   * @return the match flags currently configured on this engine, as a bit mask
   */
  int getFlags();

  /**
   * @return the name of the character set this engine is configured with
   */
  String getCharset();

  /**
   * Switches the engine to the named character set.
   *
   * @param charset name of the character set to use
   */
  void setCharset(String charset);

  /**
   * @return the serialized form of the comparator backed by this engine
   */
  byte [] toByteArray();

  /**
   * Tests a byte range against the expression.
   *
   * @param value backing array holding the bytes to test
   * @param offset index of the first byte to test
   * @param length number of bytes to test
   * @return 0 if the expression matched, 1 otherwise (as both implementations in
   *         this file do)
   */
  int compareTo(byte[] value, int offset, int length);
}
258
259
260
261
262
263
264 static class JavaRegexEngine implements Engine {
265 private Charset charset = Charset.forName("UTF-8");
266 private Pattern pattern;
267
268 public JavaRegexEngine(String regex, int flags) {
269 this.pattern = Pattern.compile(regex, flags);
270 }
271
272 @Override
273 public String getPattern() {
274 return pattern.toString();
275 }
276
277 @Override
278 public int getFlags() {
279 return pattern.flags();
280 }
281
282 @Override
283 public String getCharset() {
284 return charset.name();
285 }
286
287 @Override
288 public void setCharset(String charset) {
289 this.charset = Charset.forName(charset);
290 }
291
292 @Override
293 public int compareTo(byte[] value, int offset, int length) {
294
295
296 String tmp;
297 if (length < value.length / 2) {
298
299
300 tmp = new String(Arrays.copyOfRange(value, offset, offset + length), charset);
301 } else {
302 tmp = new String(value, offset, length, charset);
303 }
304 return pattern.matcher(tmp).find() ? 0 : 1;
305 }
306
307 @Override
308 public byte[] toByteArray() {
309 ComparatorProtos.RegexStringComparator.Builder builder =
310 ComparatorProtos.RegexStringComparator.newBuilder();
311 builder.setPattern(pattern.pattern());
312 builder.setPatternFlags(pattern.flags());
313 builder.setCharset(charset.name());
314 builder.setEngine(EngineType.JAVA.name());
315 return builder.build().toByteArray();
316 }
317 }
318
319
320
321
322
323
324
325
326
327
328 static class JoniRegexEngine implements Engine {
329 private Encoding encoding = UTF8Encoding.INSTANCE;
330 private String regex;
331 private Regex pattern;
332
333 public JoniRegexEngine(String regex, int flags) {
334 this.regex = regex;
335 byte[] b = Bytes.toBytes(regex);
336 this.pattern = new Regex(b, 0, b.length, patternToJoniFlags(flags), encoding, Syntax.Java);
337 }
338
339 @Override
340 public String getPattern() {
341 return regex;
342 }
343
344 @Override
345 public int getFlags() {
346 return pattern.getOptions();
347 }
348
349 @Override
350 public String getCharset() {
351 return encoding.getCharsetName();
352 }
353
354 @Override
355 public void setCharset(String name) {
356 setEncoding(name);
357 }
358
359 @Override
360 public int compareTo(byte[] value, int offset, int length) {
361
362
363 Matcher m = pattern.matcher(value);
364 return m.search(offset, length, pattern.getOptions()) < 0 ? 1 : 0;
365 }
366
367 @Override
368 public byte[] toByteArray() {
369 ComparatorProtos.RegexStringComparator.Builder builder =
370 ComparatorProtos.RegexStringComparator.newBuilder();
371 builder.setPattern(regex);
372 builder.setPatternFlags(joniToPatternFlags(pattern.getOptions()));
373 builder.setCharset(encoding.getCharsetName());
374 builder.setEngine(EngineType.JONI.name());
375 return builder.build().toByteArray();
376 }
377
378 private int patternToJoniFlags(int flags) {
379 int newFlags = 0;
380 if ((flags & Pattern.CASE_INSENSITIVE) != 0) {
381 newFlags |= Option.IGNORECASE;
382 }
383 if ((flags & Pattern.DOTALL) != 0) {
384
385 newFlags |= Option.MULTILINE;
386 }
387 if ((flags & Pattern.MULTILINE) != 0) {
388
389
390 newFlags &= ~Option.SINGLELINE;
391 newFlags |= Option.NEGATE_SINGLELINE;
392 }
393 return newFlags;
394 }
395
396 private int joniToPatternFlags(int flags) {
397 int newFlags = 0;
398 if ((flags & Option.IGNORECASE) != 0) {
399 newFlags |= Pattern.CASE_INSENSITIVE;
400 }
401
402 if ((flags & Option.MULTILINE) != 0) {
403 newFlags |= Pattern.DOTALL;
404 }
405
406 if ((flags & Option.NEGATE_SINGLELINE) != 0) {
407 newFlags |= Pattern.MULTILINE;
408 }
409 return newFlags;
410 }
411
412 private void setEncoding(String name) {
413 EncodingDB.Entry e = EncodingDB.getEncodings().get(Bytes.toBytes(name));
414 if (e != null) {
415 encoding = e.getEncoding();
416 } else {
417 throw new IllegalCharsetNameException(name);
418 }
419 }
420 }
421 }