1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.validator;
18
19 import java.io.Serializable;
20 import java.util.Arrays;
21 import java.util.HashSet;
22 import java.util.Set;
23
24 import org.apache.commons.validator.util.Flags;
25 import org.apache.oro.text.perl.Perl5Util;
26
27 /***
28 * <p>Validates URLs.</p>
29 * Behavour of validation is modified by passing in options:
30 * <li>ALLOW_2_SLASHES - [FALSE] Allows double '/' characters in the path
31 * component.</li>
32 * <li>NO_FRAGMENT- [FALSE] By default fragments are allowed, if this option is
33 * included then fragments are flagged as illegal.</li>
34 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
35 * considered valid schemes. Enabling this option will let any scheme pass validation.</li>
36 *
37 * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
38 * http://javascript.internet.com. However, this validation now bears little resemblance
39 * to the php original.</p>
40 * <pre>
41 * Example of usage:
42 * Construct a UrlValidator with valid schemes of "http", and "https".
43 *
44 * String[] schemes = {"http","https"}.
45 * UrlValidator urlValidator = new UrlValidator(schemes);
46 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
47 * System.out.println("url is valid");
48 * } else {
49 * System.out.println("url is invalid");
50 * }
51 *
52 * prints "url is invalid"
53 * If instead the default constructor is used.
54 *
55 * UrlValidator urlValidator = new UrlValidator();
56 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
57 * System.out.println("url is valid");
58 * } else {
59 * System.out.println("url is invalid");
60 * }
61 *
62 * prints out "url is valid"
63 * </pre>
64 *
65 * @see
66 * <a href='http://www.ietf.org/rfc/rfc2396.txt' >
67 * Uniform Resource Identifiers (URI): Generic Syntax
68 * </a>
69 *
70 * @version $Revision: 478334 $ $Date: 2006-11-22 21:31:54 +0000 (Wed, 22 Nov 2006) $
71 * @since Validator 1.1
72 */
73 public class UrlValidator implements Serializable {
74
75 /***
76 * Allows all validly formatted schemes to pass validation instead of
77 * supplying a set of valid schemes.
78 */
79 public static final int ALLOW_ALL_SCHEMES = 1 << 0;
80
81 /***
82 * Allow two slashes in the path component of the URL.
83 */
84 public static final int ALLOW_2_SLASHES = 1 << 1;
85
86 /***
87 * Enabling this options disallows any URL fragments.
88 */
89 public static final int NO_FRAGMENTS = 1 << 2;
90
91 private static final String ALPHA_CHARS = "a-zA-Z";
92
93 private static final String ALPHA_NUMERIC_CHARS = ALPHA_CHARS + "//d";
94
95 private static final String SPECIAL_CHARS = ";/@&=,.?:+$";
96
97 private static final String VALID_CHARS = "[^//s" + SPECIAL_CHARS + "]";
98
99 private static final String SCHEME_CHARS = ALPHA_CHARS;
100
101
102 private static final String AUTHORITY_CHARS = ALPHA_NUMERIC_CHARS + "//-//.";
103
104 private static final String ATOM = VALID_CHARS + '+';
105
106 /***
107 * This expression derived/taken from the BNF for URI (RFC2396).
108 */
109 private static final String URL_PATTERN =
110 "/^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(//?([^#]*))?(#(.*))?/";
111
112
113 /***
114 * Schema/Protocol (ie. http:, ftp:, file:, etc).
115 */
116 private static final int PARSE_URL_SCHEME = 2;
117
118 /***
119 * Includes hostname/ip and port number.
120 */
121 private static final int PARSE_URL_AUTHORITY = 4;
122
123 private static final int PARSE_URL_PATH = 5;
124
125 private static final int PARSE_URL_QUERY = 7;
126
127 private static final int PARSE_URL_FRAGMENT = 9;
128
129 /***
130 * Protocol (ie. http:, ftp:,https:).
131 */
132 private static final String SCHEME_PATTERN = "/^[" + SCHEME_CHARS + "]/";
133
134 private static final String AUTHORITY_PATTERN =
135 "/^([" + AUTHORITY_CHARS + "]*)(://d*)?(.*)?/";
136
137
138 private static final int PARSE_AUTHORITY_HOST_IP = 1;
139
140 private static final int PARSE_AUTHORITY_PORT = 2;
141
142 /***
143 * Should always be empty.
144 */
145 private static final int PARSE_AUTHORITY_EXTRA = 3;
146
147 private static final String PATH_PATTERN = "/^(/[-//w:@&?=+,.!/~*'%$_;]*)?$/";
148
149 private static final String QUERY_PATTERN = "/^(.*)$/";
150
151 private static final String LEGAL_ASCII_PATTERN = "/^[//000-//177]+$/";
152
153 private static final String IP_V4_DOMAIN_PATTERN =
154 "/^(//d{1,3})[.](//d{1,3})[.](//d{1,3})[.](//d{1,3})$/";
155
156 private static final String DOMAIN_PATTERN =
157 "/^" + ATOM + "(//." + ATOM + ")*$/";
158
159 private static final String PORT_PATTERN = "/^:(//d{1,5})$/";
160
161 private static final String ATOM_PATTERN = "/(" + ATOM + ")/";
162
163 private static final String ALPHA_PATTERN = "/^[" + ALPHA_CHARS + "]/";
164
165 /***
166 * Holds the set of current validation options.
167 */
168 private Flags options = null;
169
170 /***
171 * The set of schemes that are allowed to be in a URL.
172 */
173 private Set allowedSchemes = new HashSet();
174
175 /***
176 * If no schemes are provided, default to this set.
177 */
178 protected String[] defaultSchemes = {"http", "https", "ftp"};
179
180 /***
181 * Create a UrlValidator with default properties.
182 */
183 public UrlValidator() {
184 this(null);
185 }
186
187 /***
188 * Behavior of validation is modified by passing in several strings options:
189 * @param schemes Pass in one or more url schemes to consider valid, passing in
190 * a null will default to "http,https,ftp" being valid.
191 * If a non-null schemes is specified then all valid schemes must
192 * be specified. Setting the ALLOW_ALL_SCHEMES option will
193 * ignore the contents of schemes.
194 */
195 public UrlValidator(String[] schemes) {
196 this(schemes, 0);
197 }
198
199 /***
200 * Initialize a UrlValidator with the given validation options.
201 * @param options The options should be set using the public constants declared in
202 * this class. To set multiple options you simply add them together. For example,
203 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
204 */
205 public UrlValidator(int options) {
206 this(null, options);
207 }
208
209 /***
210 * Behavour of validation is modified by passing in options:
211 * @param schemes The set of valid schemes.
212 * @param options The options should be set using the public constants declared in
213 * this class. To set multiple options you simply add them together. For example,
214 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
215 */
216 public UrlValidator(String[] schemes, int options) {
217 this.options = new Flags(options);
218
219 if (this.options.isOn(ALLOW_ALL_SCHEMES)) {
220 return;
221 }
222
223 if (schemes == null) {
224 schemes = this.defaultSchemes;
225 }
226
227 this.allowedSchemes.addAll(Arrays.asList(schemes));
228 }
229
230 /***
231 * <p>Checks if a field has a valid url address.</p>
232 *
233 * @param value The value validation is being performed on. A <code>null</code>
234 * value is considered invalid.
235 * @return true if the url is valid.
236 */
237 public boolean isValid(String value) {
238 if (value == null) {
239 return false;
240 }
241
242 Perl5Util matchUrlPat = new Perl5Util();
243 Perl5Util matchAsciiPat = new Perl5Util();
244
245 if (!matchAsciiPat.match(LEGAL_ASCII_PATTERN, value)) {
246 return false;
247 }
248
249
250 if (!matchUrlPat.match(URL_PATTERN, value)) {
251 return false;
252 }
253
254 if (!isValidScheme(matchUrlPat.group(PARSE_URL_SCHEME))) {
255 return false;
256 }
257
258 if (!isValidAuthority(matchUrlPat.group(PARSE_URL_AUTHORITY))) {
259 return false;
260 }
261
262 if (!isValidPath(matchUrlPat.group(PARSE_URL_PATH))) {
263 return false;
264 }
265
266 if (!isValidQuery(matchUrlPat.group(PARSE_URL_QUERY))) {
267 return false;
268 }
269
270 if (!isValidFragment(matchUrlPat.group(PARSE_URL_FRAGMENT))) {
271 return false;
272 }
273
274 return true;
275 }
276
277 /***
278 * Validate scheme. If schemes[] was initialized to a non null,
279 * then only those scheme's are allowed. Note this is slightly different
280 * than for the constructor.
281 * @param scheme The scheme to validate. A <code>null</code> value is considered
282 * invalid.
283 * @return true if valid.
284 */
285 protected boolean isValidScheme(String scheme) {
286 if (scheme == null) {
287 return false;
288 }
289
290 Perl5Util schemeMatcher = new Perl5Util();
291 if (!schemeMatcher.match(SCHEME_PATTERN, scheme)) {
292 return false;
293 }
294
295 if (this.options.isOff(ALLOW_ALL_SCHEMES)) {
296
297 if (!this.allowedSchemes.contains(scheme)) {
298 return false;
299 }
300 }
301
302 return true;
303 }
304
305 /***
306 * Returns true if the authority is properly formatted. An authority is the combination
307 * of hostname and port. A <code>null</code> authority value is considered invalid.
308 * @param authority Authority value to validate.
309 * @return true if authority (hostname and port) is valid.
310 */
311 protected boolean isValidAuthority(String authority) {
312 if (authority == null) {
313 return false;
314 }
315
316 Perl5Util authorityMatcher = new Perl5Util();
317 Perl5Util matchIPV4Pat = new Perl5Util();
318
319 if (!authorityMatcher.match(AUTHORITY_PATTERN, authority)) {
320 return false;
321 }
322
323 boolean ipV4Address = false;
324 boolean hostname = false;
325
326 String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
327 ipV4Address = matchIPV4Pat.match(IP_V4_DOMAIN_PATTERN, hostIP);
328
329 if (ipV4Address) {
330
331 for (int i = 1; i <= 4; i++) {
332 String ipSegment = matchIPV4Pat.group(i);
333 if (ipSegment == null || ipSegment.length() <= 0) {
334 return false;
335 }
336
337 try {
338 if (Integer.parseInt(ipSegment) > 255) {
339 return false;
340 }
341 } catch(NumberFormatException e) {
342 return false;
343 }
344
345 }
346 } else {
347
348 Perl5Util domainMatcher = new Perl5Util();
349 hostname = domainMatcher.match(DOMAIN_PATTERN, hostIP);
350 }
351
352
353 if (hostname) {
354
355
356 char[] chars = hostIP.toCharArray();
357 int size = 1;
358 for(int i=0; i<chars.length; i++) {
359 if(chars[i] == '.') {
360 size++;
361 }
362 }
363 String[] domainSegment = new String[size];
364 boolean match = true;
365 int segmentCount = 0;
366 int segmentLength = 0;
367 Perl5Util atomMatcher = new Perl5Util();
368
369 while (match) {
370 match = atomMatcher.match(ATOM_PATTERN, hostIP);
371 if (match) {
372 domainSegment[segmentCount] = atomMatcher.group(1);
373 segmentLength = domainSegment[segmentCount].length() + 1;
374 hostIP =
375 (segmentLength >= hostIP.length())
376 ? ""
377 : hostIP.substring(segmentLength);
378
379 segmentCount++;
380 }
381 }
382 String topLevel = domainSegment[segmentCount - 1];
383 if (topLevel.length() < 2 || topLevel.length() > 4) {
384 return false;
385 }
386
387
388 Perl5Util alphaMatcher = new Perl5Util();
389 if (!alphaMatcher.match(ALPHA_PATTERN, topLevel.substring(0, 1))) {
390 return false;
391 }
392
393
394 if (segmentCount < 2) {
395 return false;
396 }
397 }
398
399 if (!hostname && !ipV4Address) {
400 return false;
401 }
402
403 String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
404 if (port != null) {
405 Perl5Util portMatcher = new Perl5Util();
406 if (!portMatcher.match(PORT_PATTERN, port)) {
407 return false;
408 }
409 }
410
411 String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
412 if (!GenericValidator.isBlankOrNull(extra)) {
413 return false;
414 }
415
416 return true;
417 }
418
419 /***
420 * Returns true if the path is valid. A <code>null</code> value is considered invalid.
421 * @param path Path value to validate.
422 * @return true if path is valid.
423 */
424 protected boolean isValidPath(String path) {
425 if (path == null) {
426 return false;
427 }
428
429 Perl5Util pathMatcher = new Perl5Util();
430
431 if (!pathMatcher.match(PATH_PATTERN, path)) {
432 return false;
433 }
434
435 int slash2Count = countToken("//", path);
436 if (this.options.isOff(ALLOW_2_SLASHES) && (slash2Count > 0)) {
437 return false;
438 }
439
440 int slashCount = countToken("/", path);
441 int dot2Count = countToken("..", path);
442 if (dot2Count > 0) {
443 if ((slashCount - slash2Count - 1) <= dot2Count) {
444 return false;
445 }
446 }
447
448 return true;
449 }
450
451 /***
452 * Returns true if the query is null or it's a properly formatted query string.
453 * @param query Query value to validate.
454 * @return true if query is valid.
455 */
456 protected boolean isValidQuery(String query) {
457 if (query == null) {
458 return true;
459 }
460
461 Perl5Util queryMatcher = new Perl5Util();
462 return queryMatcher.match(QUERY_PATTERN, query);
463 }
464
465 /***
466 * Returns true if the given fragment is null or fragments are allowed.
467 * @param fragment Fragment value to validate.
468 * @return true if fragment is valid.
469 */
470 protected boolean isValidFragment(String fragment) {
471 if (fragment == null) {
472 return true;
473 }
474
475 return this.options.isOff(NO_FRAGMENTS);
476 }
477
478 /***
479 * Returns the number of times the token appears in the target.
480 * @param token Token value to be counted.
481 * @param target Target value to count tokens in.
482 * @return the number of tokens.
483 */
484 protected int countToken(String token, String target) {
485 int tokenIndex = 0;
486 int count = 0;
487 while (tokenIndex != -1) {
488 tokenIndex = target.indexOf(token, tokenIndex);
489 if (tokenIndex > -1) {
490 tokenIndex++;
491 count++;
492 }
493 }
494 return count;
495 }
496 }