1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 package org.apache.commons.httpclient;
31
32 import java.io.IOException;
33 import java.io.ObjectInputStream;
34 import java.io.ObjectOutputStream;
35 import java.io.Serializable;
36 import java.util.Arrays;
37 import java.util.Locale;
38 import java.util.BitSet;
39 import java.util.Hashtable;
40
41 import org.apache.commons.codec.DecoderException;
42 import org.apache.commons.codec.net.URLCodec;
43 import org.apache.commons.httpclient.util.EncodingUtil;
44
45 /***
46 * The interface for the URI(Uniform Resource Identifiers) version of RFC 2396.
 * This class supports parsing a URI reference so that it can be extended for
 * specific protocols, taking into account the character encoding of the
 * protocol to be transported and the charset of the document.
50 * <p>
51 * A URI is always in an "escaped" form, since escaping or unescaping a
52 * completed URI might change its semantics.
53 * <p>
54 * Implementers should be careful not to escape or unescape the same string
55 * more than once, since unescaping an already unescaped string might lead to
56 * misinterpreting a percent data character as another escaped character,
57 * or vice versa in the case of escaping an already escaped string.
58 * <p>
 * In order to avoid these problems, the following data types are used:
60 * <p><blockquote><pre>
61 * URI character sequence: char
62 * octet sequence: byte
63 * original character sequence: String
64 * </pre></blockquote><p>
65 *
66 * So, a URI is a sequence of characters as an array of a char type, which
67 * is not always represented as a sequence of octets as an array of byte.
68 * <p>
69 *
70 * URI Syntactic Components
71 * <p><blockquote><pre>
72 * - In general, written as follows:
73 * Absolute URI = <scheme>:<scheme-specific-part>
74 * Generic URI = <scheme>://<authority><path>?<query>
75 *
76 * - Syntax
77 * absoluteURI = scheme ":" ( hier_part | opaque_part )
78 * hier_part = ( net_path | abs_path ) [ "?" query ]
79 * net_path = "//" authority [ abs_path ]
80 * abs_path = "/" path_segments
81 * </pre></blockquote><p>
82 *
83 * The following examples illustrate URI that are in common use.
84 * <pre>
85 * ftp://ftp.is.co.za/rfc/rfc1808.txt
86 * -- ftp scheme for File Transfer Protocol services
87 * gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles
88 * -- gopher scheme for Gopher and Gopher+ Protocol services
89 * http://www.math.uio.no/faq/compression-faq/part1.html
90 * -- http scheme for Hypertext Transfer Protocol services
91 * mailto:mduerst@ifi.unizh.ch
92 * -- mailto scheme for electronic mail addresses
93 * news:comp.infosystems.www.servers.unix
94 * -- news scheme for USENET news groups and articles
95 * telnet://melvyl.ucop.edu/
96 * -- telnet scheme for interactive services via the TELNET Protocol
97 * </pre>
98 * Please, notice that there are many modifications from URL(RFC 1738) and
99 * relative URL(RFC 1808).
100 * <p>
101 * <b>The expressions for a URI</b>
102 * <p><pre>
103 * For escaped URI forms
104 * - URI(char[]) // constructor
105 * - char[] getRawXxx() // method
106 * - String getEscapedXxx() // method
107 * - String toString() // method
108 * <p>
109 * For unescaped URI forms
110 * - URI(String) // constructor
111 * - String getXXX() // method
112 * </pre><p>
113 *
114 * @author <a href="mailto:jericho@apache.org">Sung-Gu</a>
115 * @author <a href="mailto:mbowler@GargoyleSoftware.com">Mike Bowler</a>
116 * @version $Revision: 415961 $ $Date: 2002/03/14 15:14:01
117 */
118 public class URI implements Cloneable, Comparable, Serializable {
119
120
121
122
/**
 * Creates an instance for internal use.
 * <p>
 * No URI text is parsed here; every component keeps its field default.
 * The constructor is <code>protected</code>, so it is reachable only from
 * this package and from subclasses.
 */
protected URI() {
    super();
}
126
/**
 * Builds a URI from a string, using the given protocol charset.
 * <p>
 * The charset is stored first and the string is then handed to
 * <code>parseUriReference</code> together with the <code>escaped</code> flag.
 *
 * @param s the URI character sequence
 * @param escaped <tt>true</tt> when <code>s</code> is already in escaped
 *                form, <tt>false</tt> when it still has to be escaped
 * @param charset the charset used for escape encoding, if required
 *
 * @throws URIException if the URI cannot be created
 * @throws NullPointerException if the input string is <code>null</code>
 *
 * @see #getProtocolCharset
 *
 * @since 3.0
 */
public URI(String s, boolean escaped, String charset)
        throws URIException, NullPointerException {
    this.protocolCharset = charset;
    this.parseUriReference(s, escaped);
}
148
/**
 * Builds a URI from a string that is either escaped or unescaped.
 * <p>
 * No protocol charset is stored by this constructor; the string is passed
 * straight to <code>parseUriReference</code>.
 *
 * @param s the URI character sequence
 * @param escaped <tt>true</tt> when <code>s</code> is already in escaped
 *                form, <tt>false</tt> when it still has to be escaped
 *
 * @throws URIException if the URI cannot be created
 * @throws NullPointerException if the input string is <code>null</code>
 *
 * @see #getProtocolCharset
 *
 * @since 3.0
 */
public URI(String s, boolean escaped)
        throws URIException, NullPointerException {
    this.parseUriReference(s, escaped);
}
168
169 /***
170 * Construct a URI as an escaped form of a character array with the given
171 * charset.
172 *
173 * @param escaped the URI character sequence
174 * @param charset the charset string to do escape encoding
175 * @throws URIException If the URI cannot be created.
176 * @throws NullPointerException if <code>escaped</code> is <code>null</code>
177 * @see #getProtocolCharset
178 *
179 * @deprecated Use #URI(String, boolean, String)
180 */
181 public URI(char[] escaped, String charset)
182 throws URIException, NullPointerException {
183 protocolCharset = charset;
184 parseUriReference(new String(escaped), true);
185 }
186
187
188 /***
189 * Construct a URI as an escaped form of a character array.
190 * An URI can be placed within double-quotes or angle brackets like
191 * "http://test.com/" and <http://test.com/>
192 *
193 * @param escaped the URI character sequence
194 * @throws URIException If the URI cannot be created.
195 * @throws NullPointerException if <code>escaped</code> is <code>null</code>
196 * @see #getDefaultProtocolCharset
197 *
198 * @deprecated Use #URI(String, boolean)
199 */
200 public URI(char[] escaped)
201 throws URIException, NullPointerException {
202 parseUriReference(new String(escaped), true);
203 }
204
205
/**
 * Builds a URI from an unescaped string, using the given protocol charset.
 *
 * @param original the string to turn into a URI character sequence;
 *                 either an absoluteURI or a relativeURI
 * @param charset the charset used for escape encoding
 * @throws URIException if the URI cannot be created
 * @see #getProtocolCharset
 *
 * @deprecated Use {@link #URI(String, boolean, String)}
 */
public URI(String original, String charset) throws URIException {
    this.protocolCharset = charset;
    // "false": the input is parsed as unescaped text.
    this.parseUriReference(original, false);
}
221
222
/**
 * Builds a URI from an unescaped string.
 * <p><blockquote><pre>
 *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 * </pre></blockquote><p>
 * A URI can be placed within double-quotes or angle brackets, like
 * "http://test.com/" and &lt;http://test.com/&gt;.
 *
 * @param original the string to turn into a URI character sequence;
 *                 either an absoluteURI or a relativeURI
 * @throws URIException if the URI cannot be created
 * @see #getDefaultProtocolCharset
 *
 * @deprecated Use {@link #URI(String, boolean)}
 */
public URI(String original) throws URIException {
    // "false": the input is parsed as unescaped text.
    this.parseUriReference(original, false);
}
241
242
/**
 * Constructs an opaque absolute URI from the given components.
 * <p><blockquote><pre>
 *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 *   absoluteURI   = scheme ":" ( hier_part | opaque_part )
 *   opaque_part   = uric_no_slash *uric
 * </pre></blockquote><p>
 * The result has the form
 * &lt;scheme&gt;:&lt;scheme-specific-part&gt;#&lt;fragment&gt;.
 * <p>
 * The scheme is lower-cased and validated against the scheme character set;
 * the scheme-specific part is encoded with the protocol charset; the
 * fragment, when present, is stored as given.
 *
 * @param scheme the scheme string; must not be <code>null</code>
 * @param schemeSpecificPart the scheme-specific part
 * @param fragment the fragment string, or <code>null</code> for none
 * @throws URIException if the scheme is missing or contains characters that
 *                      are not allowed in a scheme, or if the URI cannot be
 *                      created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String schemeSpecificPart, String fragment)
        throws URIException {

    if (scheme == null) {
        throw new URIException(URIException.PARSING, "scheme required");
    }
    // Lower-case with a fixed locale. The no-argument toLowerCase() follows
    // the default locale, so under e.g. a Turkish locale "FILE" would become
    // "f\u0131le" and a perfectly valid scheme would fail validation below.
    char[] s = scheme.toLowerCase(Locale.ENGLISH).toCharArray();
    if (!validate(s, URI.scheme)) {
        throw new URIException(URIException.PARSING, "incorrect scheme");
    }
    _scheme = s;
    _opaque = encode(schemeSpecificPart, allowed_opaque_part,
            getProtocolCharset());
    // This constructor only ever produces the opaque form.
    _is_opaque_part = true;
    _fragment = (fragment == null) ? null : fragment.toCharArray();
    setURI();
}
279
280
281 /***
282 * Construct a general URI from the given components.
283 * <p><blockquote><pre>
284 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
285 * absoluteURI = scheme ":" ( hier_part | opaque_part )
286 * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
287 * hier_part = ( net_path | abs_path ) [ "?" query ]
288 * </pre></blockquote><p>
289 * It's for absolute URI = <scheme>:<path>?<query>#<
290 * fragment> and relative URI = <path>?<query>#<fragment
291 * >.
292 *
293 * @param scheme the scheme string
294 * @param authority the authority string
295 * @param path the path string
296 * @param query the query string
297 * @param fragment the fragment string
298 * @throws URIException If the new URI cannot be created.
299 * @see #getDefaultProtocolCharset
300 */
301 public URI(String scheme, String authority, String path, String query,
302 String fragment) throws URIException {
303
304
305 StringBuffer buff = new StringBuffer();
306 if (scheme != null) {
307 buff.append(scheme);
308 buff.append(':');
309 }
310 if (authority != null) {
311 buff.append("//");
312 buff.append(authority);
313 }
314 if (path != null) {
315 if ((scheme != null || authority != null)
316 && !path.startsWith("/")) {
317 throw new URIException(URIException.PARSING,
318 "abs_path requested");
319 }
320 buff.append(path);
321 }
322 if (query != null) {
323 buff.append('?');
324 buff.append(query);
325 }
326 if (fragment != null) {
327 buff.append('#');
328 buff.append(fragment);
329 }
330 parseUriReference(buff.toString(), false);
331 }
332
333
/**
 * Constructs a general URI from scheme, userinfo, host and port.
 * <p>
 * Delegates to the seven-argument constructor with <code>null</code> for
 * path, query and fragment.
 *
 * @param scheme the scheme string
 * @param userinfo the userinfo string
 * @param host the host string
 * @param port the port number
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String userinfo, String host, int port)
        throws URIException {
    this(scheme, userinfo, host, port,
            /* path */ null, /* query */ null, /* fragment */ null);
}
349
350
/**
 * Constructs a general URI from scheme, userinfo, host, port and path.
 * <p>
 * Delegates to the seven-argument constructor with <code>null</code> for
 * query and fragment.
 *
 * @param scheme the scheme string
 * @param userinfo the userinfo string
 * @param host the host string
 * @param port the port number
 * @param path the path string
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String userinfo, String host, int port,
        String path) throws URIException {
    this(scheme, userinfo, host, port, path,
            /* query */ null, /* fragment */ null);
}
367
368
/**
 * Constructs a general URI from scheme, userinfo, host, port, path and
 * query.
 * <p>
 * Delegates to the seven-argument constructor with a <code>null</code>
 * fragment.
 *
 * @param scheme the scheme string
 * @param userinfo the userinfo string
 * @param host the host string
 * @param port the port number
 * @param path the path string
 * @param query the query string
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String userinfo, String host, int port,
        String path, String query) throws URIException {
    this(scheme, userinfo, host, port, path, query, /* fragment */ null);
}
386
387
388 /***
389 * Construct a general URI from the given components.
390 *
391 * @param scheme the scheme string
392 * @param userinfo the userinfo string
393 * @param host the host string
394 * @param port the port number
395 * @param path the path string
396 * @param query the query string
397 * @param fragment the fragment string
398 * @throws URIException If the new URI cannot be created.
399 * @see #getDefaultProtocolCharset
400 */
401 public URI(String scheme, String userinfo, String host, int port,
402 String path, String query, String fragment) throws URIException {
403
404 this(scheme, (host == null) ? null
405 : ((userinfo != null) ? userinfo + '@' : "") + host
406 + ((port != -1) ? ":" + port : ""), path, query, fragment);
407 }
408
409
/**
 * Constructs a general URI from scheme, host, path and fragment.
 * <p>
 * Delegates to the (scheme, authority, path, query, fragment) constructor,
 * passing the host as the authority and <code>null</code> as the query.
 *
 * @param scheme the scheme string
 * @param host the host string
 * @param path the path string
 * @param fragment the fragment string
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String host, String path, String fragment)
        throws URIException {
    this(scheme, host, path, /* query */ null, fragment);
}
425
426
/**
 * Resolves an unescaped relative URI string against a base URI.
 * <p>
 * The string is first turned into a URI with the single-string constructor
 * and then resolved with {@link #URI(URI, URI)}.
 *
 * @param base the base URI
 * @param relative the relative URI string
 * @throws URIException if the new URI cannot be created
 *
 * @deprecated Use {@link #URI(URI, String, boolean)}
 */
public URI(URI base, String relative) throws URIException {
    this(base,
            new URI(relative));
}
439
440
/**
 * Resolves a relative URI string against a base URI.
 * <p>
 * The string is first turned into a URI with
 * {@link #URI(String, boolean)} and then resolved with
 * {@link #URI(URI, URI)}.
 *
 * @param base the base URI
 * @param relative the relative URI string
 * @param escaped <tt>true</tt> when <code>relative</code> is already in
 *                escaped form, <tt>false</tt> otherwise
 *
 * @throws URIException if the new URI cannot be created
 *
 * @since 3.0
 */
public URI(URI base, String relative, boolean escaped) throws URIException {
    this(base,
            new URI(relative, escaped));
}
456
457
458 /***
459 * Construct a general URI with the given relative URI.
460 * <p><blockquote><pre>
461 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
462 * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
463 * </pre></blockquote><p>
464 * Resolving Relative References to Absolute Form.
465 *
466 * <strong>Examples of Resolving Relative URI References</strong>
467 *
468 * Within an object with a well-defined base URI of
469 * <p><blockquote><pre>
470 * http://a/b/c/d;p?q
471 * </pre></blockquote><p>
472 * the relative URI would be resolved as follows:
473 *
474 * Normal Examples
475 *
476 * <p><blockquote><pre>
477 * g:h = g:h
478 * g = http://a/b/c/g
479 * ./g = http://a/b/c/g
480 * g/ = http://a/b/c/g/
481 * /g = http://a/g
482 * //g = http://g
483 * ?y = http://a/b/c/?y
484 * g?y = http://a/b/c/g?y
485 * #s = (current document)#s
486 * g#s = http://a/b/c/g#s
487 * g?y#s = http://a/b/c/g?y#s
488 * ;x = http://a/b/c/;x
489 * g;x = http://a/b/c/g;x
490 * g;x?y#s = http://a/b/c/g;x?y#s
491 * . = http://a/b/c/
492 * ./ = http://a/b/c/
493 * .. = http://a/b/
494 * ../ = http://a/b/
495 * ../g = http://a/b/g
496 * ../.. = http://a/
497 * ../../ = http://a/
498 * ../../g = http://a/g
499 * </pre></blockquote><p>
500 *
501 * Some URI schemes do not allow a hierarchical syntax matching the
502 * <hier_part> syntax, and thus cannot use relative references.
503 *
504 * @param base the base URI
505 * @param relative the relative URI
506 * @throws URIException If the new URI cannot be created.
507 */
508 public URI(URI base, URI relative) throws URIException {
509
510 if (base._scheme == null) {
511 throw new URIException(URIException.PARSING, "base URI required");
512 }
513 if (base._scheme != null) {
514 this._scheme = base._scheme;
515 this._authority = base._authority;
516 this._is_net_path = base._is_net_path;
517 }
518 if (base._is_opaque_part || relative._is_opaque_part) {
519 this._scheme = base._scheme;
520 this._is_opaque_part = base._is_opaque_part
521 || relative._is_opaque_part;
522 this._opaque = relative._opaque;
523 this._fragment = relative._fragment;
524 this.setURI();
525 return;
526 }
527 boolean schemesEqual = Arrays.equals(base._scheme,relative._scheme);
528 if (relative._scheme != null
529 && (!schemesEqual || relative._authority != null)) {
530 this._scheme = relative._scheme;
531 this._is_net_path = relative._is_net_path;
532 this._authority = relative._authority;
533 if (relative._is_server) {
534 this._is_server = relative._is_server;
535 this._userinfo = relative._userinfo;
536 this._host = relative._host;
537 this._port = relative._port;
538 } else if (relative._is_reg_name) {
539 this._is_reg_name = relative._is_reg_name;
540 }
541 this._is_abs_path = relative._is_abs_path;
542 this._is_rel_path = relative._is_rel_path;
543 this._path = relative._path;
544 } else if (base._authority != null && relative._scheme == null) {
545 this._is_net_path = base._is_net_path;
546 this._authority = base._authority;
547 if (base._is_server) {
548 this._is_server = base._is_server;
549 this._userinfo = base._userinfo;
550 this._host = base._host;
551 this._port = base._port;
552 } else if (base._is_reg_name) {
553 this._is_reg_name = base._is_reg_name;
554 }
555 }
556 if (relative._authority != null) {
557 this._is_net_path = relative._is_net_path;
558 this._authority = relative._authority;
559 if (relative._is_server) {
560 this._is_server = relative._is_server;
561 this._userinfo = relative._userinfo;
562 this._host = relative._host;
563 this._port = relative._port;
564 } else if (relative._is_reg_name) {
565 this._is_reg_name = relative._is_reg_name;
566 }
567 this._is_abs_path = relative._is_abs_path;
568 this._is_rel_path = relative._is_rel_path;
569 this._path = relative._path;
570 }
571
572 if (relative._authority == null
573 && (relative._scheme == null || schemesEqual)) {
574 if ((relative._path == null || relative._path.length == 0)
575 && relative._query == null) {
576
577
578 this._path = base._path;
579 this._query = base._query;
580 } else {
581 this._path = resolvePath(base._path, relative._path);
582 }
583 }
584
585 if (relative._query != null) {
586 this._query = relative._query;
587 }
588
589 if (relative._fragment != null) {
590 this._fragment = relative._fragment;
591 }
592 this.setURI();
593
594
595 parseUriReference(new String(_uri), true);
596 }
597
598
599
600 /*** Version ID for serialization */
601 static final long serialVersionUID = 604752400577948726L;
602
603
604 /***
605 * Cache the hash code for this URI.
606 */
607 protected int hash = 0;
608
609
610 /***
611 * This Uniform Resource Identifier (URI).
612 * The URI is always in an "escaped" form, since escaping or unescaping
613 * a completed URI might change its semantics.
614 */
615 protected char[] _uri = null;
616
617
618 /***
619 * The charset of the protocol used by this URI instance.
620 */
621 protected String protocolCharset = null;
622
623
624 /***
625 * The default charset of the protocol. RFC 2277, 2396
626 */
627 protected static String defaultProtocolCharset = "UTF-8";
628
629
630 /***
631 * The default charset of the document. RFC 2277, 2396
632 * The platform's charset is used for the document by default.
633 */
634 protected static String defaultDocumentCharset = null;
635 protected static String defaultDocumentCharsetByLocale = null;
636 protected static String defaultDocumentCharsetByPlatform = null;
637
638 static {
639 Locale locale = Locale.getDefault();
640
641 if (locale != null) {
642 defaultDocumentCharsetByLocale =
643 LocaleToCharsetMap.getCharset(locale);
644
645 defaultDocumentCharset = defaultDocumentCharsetByLocale;
646 }
647
648 try {
649 defaultDocumentCharsetByPlatform = System.getProperty("file.encoding");
650 } catch (SecurityException ignore) {
651 }
652 if (defaultDocumentCharset == null) {
653
654 defaultDocumentCharset = defaultDocumentCharsetByPlatform;
655 }
656 }
657
658
659 /***
660 * The scheme.
661 */
662 protected char[] _scheme = null;
663
664
665 /***
666 * The opaque.
667 */
668 protected char[] _opaque = null;
669
670
671 /***
672 * The authority.
673 */
674 protected char[] _authority = null;
675
676
677 /***
678 * The userinfo.
679 */
680 protected char[] _userinfo = null;
681
682
683 /***
684 * The host.
685 */
686 protected char[] _host = null;
687
688
689 /***
690 * The port.
691 */
692 protected int _port = -1;
693
694
695 /***
696 * The path.
697 */
698 protected char[] _path = null;
699
700
701 /***
702 * The query.
703 */
704 protected char[] _query = null;
705
706
707 /***
708 * The fragment.
709 */
710 protected char[] _fragment = null;
711
712
713 /***
714 * The root path.
715 */
716 protected static char[] rootPath = { '/' };
717
718
719
/**
 * BitSet containing only the percent sign.
 * <p>
 * The percent "%" character is reserved as the escape indicator, so it must
 * be written as "%25" when it is meant as data within a URI.
 */
protected static final BitSet percent = new BitSet(256);

static {
    final char escapeIndicator = '%';
    percent.set(escapeIndicator);
}
730
731
/**
 * BitSet for the decimal digits.
 * <p><blockquote><pre>
 * digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
 *            "8" | "9"
 * </pre></blockquote><p>
 */
protected static final BitSet digit = new BitSet(256);

static {
    for (char c = '0'; c <= '9'; c++) {
        digit.set(c);
    }
}
746
747
/**
 * BitSet for the ASCII letters.
 * <p><blockquote><pre>
 *  alpha         = lowalpha | upalpha
 * </pre></blockquote><p>
 */
protected static final BitSet alpha = new BitSet(256);

static {
    // 'a'..'z' and 'A'..'Z' are both runs of 26 consecutive code points.
    for (int offset = 0; offset < 26; offset++) {
        alpha.set('a' + offset);
        alpha.set('A' + offset);
    }
}
764
765
/**
 * BitSet for alphanum, the union of alpha and digit.
 * <p><blockquote><pre>
 *  alphanum      = alpha | digit
 * </pre></blockquote><p>
 */
protected static final BitSet alphanum = new BitSet(256);

static {
    alphanum.or(digit);
    alphanum.or(alpha);
}
778
779
/**
 * BitSet for hexadecimal digits, in either letter case.
 * <p><blockquote><pre>
 *  hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
 *                          "a" | "b" | "c" | "d" | "e" | "f"
 * </pre></blockquote><p>
 */
protected static final BitSet hex = new BitSet(256);

static {
    hex.or(digit);
    final String hexLetters = "abcdefABCDEF";
    for (int i = 0; i < hexLetters.length(); i++) {
        hex.set(hexLetters.charAt(i));
    }
}
798
799
/**
 * BitSet for the characters of an escape sequence: the percent sign plus
 * the hexadecimal digits.
 * <p><blockquote><pre>
 * escaped       = "%" hex hex
 * </pre></blockquote><p>
 */
protected static final BitSet escaped = new BitSet(256);

static {
    escaped.or(hex);
    escaped.or(percent);
}
812
813
/**
 * BitSet for the mark characters.
 * <p><blockquote><pre>
 *  mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
 *                  "(" | ")"
 * </pre></blockquote><p>
 */
protected static final BitSet mark = new BitSet(256);

static {
    final String marks = "-_.!~*'()";
    for (int i = 0; i < marks.length(); i++) {
        mark.set(marks.charAt(i));
    }
}
834
835
/**
 * BitSet for unreserved characters: data characters that are allowed in a
 * URI but have no reserved purpose.
 * <p><blockquote><pre>
 *  unreserved    = alphanum | mark
 * </pre></blockquote><p>
 */
protected static final BitSet unreserved = new BitSet(256);

static {
    unreserved.or(mark);
    unreserved.or(alphanum);
}
849
850
/**
 * BitSet for the reserved characters.
 * <p><blockquote><pre>
 *  reserved      = ";" | "/" | "?" | ":" | "@" | "&amp;" | "=" | "+" |
 *                  "$" | ","
 * </pre></blockquote><p>
 */
protected static final BitSet reserved = new BitSet(256);

static {
    final String reservedChars = ";/?:@&=+$,";
    for (int i = 0; i < reservedChars.length(); i++) {
        reserved.set(reservedChars.charAt(i));
    }
}
872
873
/**
 * BitSet for uric, the union of the reserved, unreserved and escaped sets.
 * <p><blockquote><pre>
 *  uric          = reserved | unreserved | escaped
 * </pre></blockquote><p>
 */
protected static final BitSet uric = new BitSet(256);

static {
    uric.or(escaped);
    uric.or(unreserved);
    uric.or(reserved);
}
887
888
889 /***
890 * BitSet for fragment (alias for uric).
891 * <p><blockquote><pre>
892 * fragment = *uric
893 * </pre></blockquote><p>
894 */
895 protected static final BitSet fragment = uric;
896
897
898 /***
899 * BitSet for query (alias for uric).
900 * <p><blockquote><pre>
901 * query = *uric
902 * </pre></blockquote><p>
903 */
904 protected static final BitSet query = uric;
905
906
/**
 * BitSet for pchar, the characters of a path segment.
 * <p><blockquote><pre>
 *  pchar         = unreserved | escaped |
 *                  ":" | "@" | "&amp;" | "=" | "+" | "$" | ","
 * </pre></blockquote><p>
 */
protected static final BitSet pchar = new BitSet(256);

static {
    pchar.or(unreserved);
    pchar.or(escaped);
    final String extras = ":@&=+$,";
    for (int i = 0; i < extras.length(); i++) {
        pchar.set(extras.charAt(i));
    }
}
927
928
929 /***
930 * BitSet for param (alias for pchar).
931 * <p><blockquote><pre>
932 * param = *pchar
933 * </pre></blockquote><p>
934 */
935 protected static final BitSet param = pchar;
936
937
/**
 * BitSet for segment: the pchar and param characters plus the ";" that
 * separates parameters.
 * <p><blockquote><pre>
 *  segment       = *pchar *( ";" param )
 * </pre></blockquote><p>
 */
protected static final BitSet segment = new BitSet(256);

static {
    segment.set(';');
    segment.or(pchar);
    segment.or(param);
}
951
952
/**
 * BitSet for path segments: the segment characters plus the "/" separator.
 * <p><blockquote><pre>
 *  path_segments = segment *( "/" segment )
 * </pre></blockquote><p>
 */
protected static final BitSet path_segments = new BitSet(256);

static {
    path_segments.or(segment);
    path_segments.set('/');
}
965
966
/**
 * BitSet for a URI absolute path: the leading "/" plus the path_segments
 * characters.
 * <p><blockquote><pre>
 *  abs_path      = "/"  path_segments
 * </pre></blockquote><p>
 */
protected static final BitSet abs_path = new BitSet(256);

static {
    abs_path.or(path_segments);
    abs_path.set('/');
}
979
980
/**
 * BitSet for uric_no_slash: every uric character except the slash.
 * <p><blockquote><pre>
 *  uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
 *                  "&amp;" | "=" | "+" | "$" | ","
 * </pre></blockquote><p>
 */
protected static final BitSet uric_no_slash = new BitSet(256);

static {
    uric_no_slash.or(unreserved);
    uric_no_slash.or(escaped);
    uric_no_slash.set(';');
    uric_no_slash.set('?');
    // The grammar lists ":" here; the earlier code set ";" a second time
    // instead, which left the colon out of the set.
    uric_no_slash.set(':');
    uric_no_slash.set('@');
    uric_no_slash.set('&');
    uric_no_slash.set('=');
    uric_no_slash.set('+');
    uric_no_slash.set('$');
    uric_no_slash.set(',');
}
1003
1004
/**
 * BitSet for the opaque part, the union of uric_no_slash and uric.
 * <p><blockquote><pre>
 *  opaque_part   = uric_no_slash *uric
 * </pre></blockquote><p>
 */
protected static final BitSet opaque_part = new BitSet(256);

static {
    opaque_part.or(uric);
    opaque_part.or(uric_no_slash);
}
1018
1019
/**
 * BitSet for path, the union of the absolute path and opaque part sets.
 * <p><blockquote><pre>
 *  path          = [ abs_path | opaque_part ]
 * </pre></blockquote><p>
 */
protected static final BitSet path = new BitSet(256);

static {
    path.or(opaque_part);
    path.or(abs_path);
}
1032
1033
1034 /***
1035 * Port, a logical alias for digit.
1036 */
1037 protected static final BitSet port = digit;
1038
1039
/**
 * BitSet for IPv4address: the digits plus the dot.
 * <p><blockquote><pre>
 *  IPv4address   = 1*digit "." 1*digit "." 1*digit "." 1*digit
 * </pre></blockquote><p>
 */
protected static final BitSet IPv4address = new BitSet(256);

static {
    IPv4address.set('.');
    IPv4address.or(digit);
}
1052
1053
/**
 * BitSet for IPv6address (RFC 2373): the hex digits, the colon and the
 * IPv4address characters.
 * <p><blockquote><pre>
 *  IPv6address = hexpart [ ":" IPv4address ]
 * </pre></blockquote><p>
 */
protected static final BitSet IPv6address = new BitSet(256);

static {
    IPv6address.set(':');
    IPv6address.or(hex);
    IPv6address.or(IPv4address);
}
1067
1068
/**
 * BitSet for IPv6reference (RFC 2732, RFC 2373): the IPv6address characters
 * plus the enclosing square brackets.
 * <p><blockquote><pre>
 *  IPv6reference = "[" IPv6address "]"
 * </pre></blockquote><p>
 */
protected static final BitSet IPv6reference = new BitSet(256);

static {
    IPv6reference.or(IPv6address);
    IPv6reference.set('[');
    IPv6reference.set(']');
}
1082
1083
/**
 * BitSet for toplabel: the alphanumerics plus the hyphen.
 * <p><blockquote><pre>
 *  toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
 * </pre></blockquote><p>
 */
protected static final BitSet toplabel = new BitSet(256);

static {
    toplabel.set('-');
    toplabel.or(alphanum);
}
1096
1097
1098 /***
1099 * BitSet for domainlabel.
1100 * <p><blockquote><pre>
1101 * domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
1102 * </pre></blockquote><p>
1103 */
1104 protected static final BitSet domainlabel = toplabel;
1105
1106
/**
 * BitSet for hostname: the toplabel characters plus the dot.
 * <p><blockquote><pre>
 *  hostname      = *( domainlabel "." ) toplabel [ "." ]
 * </pre></blockquote><p>
 */
protected static final BitSet hostname = new BitSet(256);

static {
    hostname.set('.');
    hostname.or(toplabel);
}
1120
1121
/**
 * BitSet for host, built as the union of the hostname and IPv6reference
 * sets.
 * <p><blockquote><pre>
 *  host          = hostname | IPv4address | IPv6reference
 * </pre></blockquote><p>
 */
protected static final BitSet host = new BitSet(256);

static {
    host.or(IPv6reference);
    host.or(hostname);
}
1135
1136
/**
 * BitSet for hostport: the host characters, the colon and the port
 * characters.
 * <p><blockquote><pre>
 *  hostport      = host [ ":" port ]
 * </pre></blockquote><p>
 */
protected static final BitSet hostport = new BitSet(256);

static {
    hostport.set(':');
    hostport.or(host);
    hostport.or(port);
}
1150
1151
/**
 * BitSet for userinfo.
 * <p><blockquote><pre>
 *  userinfo      = *( unreserved | escaped |
 *                     ";" | ":" | "&amp;" | "=" | "+" | "$" | "," )
 * </pre></blockquote><p>
 */
protected static final BitSet userinfo = new BitSet(256);

static {
    userinfo.or(unreserved);
    userinfo.or(escaped);
    final String extras = ";:&=+$,";
    for (int i = 0; i < extras.length(); i++) {
        userinfo.set(extras.charAt(i));
    }
}
1172
1173
/**
 * BitSet for the parts inside the userinfo component, such as user and
 * password: the userinfo set with ";", ":", "@", "?" and "/" cleared.
 */
public static final BitSet within_userinfo = new BitSet(256);

static {
    within_userinfo.or(userinfo);
    final String excluded = ";:@?/";
    for (int i = 0; i < excluded.length(); i++) {
        within_userinfo.clear(excluded.charAt(i));
    }
}
1187
1188
/**
 * BitSet for server: the userinfo characters, the "@" separator and the
 * hostport characters.
 * <p><blockquote><pre>
 *  server        = [ [ userinfo "@" ] hostport ]
 * </pre></blockquote><p>
 */
protected static final BitSet server = new BitSet(256);

static {
    server.set('@');
    server.or(userinfo);
    server.or(hostport);
}
1202
1203
/**
 * BitSet for reg_name, a registry-based naming authority.
 * <p><blockquote><pre>
 *  reg_name      = 1*( unreserved | escaped | "$" | "," |
 *                      ";" | ":" | "@" | "&amp;" | "=" | "+" )
 * </pre></blockquote><p>
 */
protected static final BitSet reg_name = new BitSet(256);

static {
    reg_name.or(unreserved);
    reg_name.or(escaped);
    final String extras = "$,;:@&=+";
    for (int i = 0; i < extras.length(); i++) {
        reg_name.set(extras.charAt(i));
    }
}
1225
1226
/**
 * Characters that may appear in an authority production.
 * <p><blockquote><pre>
 * authority     = server | reg_name
 * </pre></blockquote><p>
 */
protected static final BitSet authority = new BitSet(256);

static {
    // Either alternative may supply a character, so take the union.
    authority.or(reg_name);
    authority.or(server);
}
1239
1240
/**
 * Characters that may appear in a scheme production.
 * <p><blockquote><pre>
 * scheme        = alpha *( alpha | digit | "+" | "-" | "." )
 * </pre></blockquote><p>
 */
protected static final BitSet scheme = new BitSet(256);

static {
    scheme.or(alpha);
    scheme.or(digit);
    // The three punctuation marks permitted after the leading letter.
    final String marks = "+-.";
    for (int i = 0; i < marks.length(); i++) {
        scheme.set(marks.charAt(i));
    }
}
1256
1257
/**
 * Characters that may appear in a rel_segment production.
 * <p><blockquote><pre>
 * rel_segment   = 1*( unreserved | escaped |
 *                     ";" | "@" | "&amp;" | "=" | "+" | "$" | "," )
 * </pre></blockquote><p>
 */
protected static final BitSet rel_segment = new BitSet(256);

static {
    rel_segment.or(unreserved);
    rel_segment.or(escaped);
    // The punctuation marks explicitly permitted by the production.
    final String marks = ";@&=+$,";
    for (int i = 0; i < marks.length(); i++) {
        rel_segment.set(marks.charAt(i));
    }
}
1278
1279
/**
 * Characters that may appear in a rel_path production.
 * <p><blockquote><pre>
 * rel_path      = rel_segment [ abs_path ]
 * </pre></blockquote><p>
 */
protected static final BitSet rel_path = new BitSet(256);

static {
    // Union of both operand sets; order does not matter.
    rel_path.or(abs_path);
    rel_path.or(rel_segment);
}
1292
1293
/**
 * Characters that may appear in a net_path production.
 * <p><blockquote><pre>
 * net_path      = "//" authority [ abs_path ]
 * </pre></blockquote><p>
 */
protected static final BitSet net_path = new BitSet(256);

static {
    // The slash of the leading "//" plus both operand sets.
    net_path.set('/');
    final BitSet[] parts = {authority, abs_path};
    for (int i = 0; i < parts.length; i++) {
        net_path.or(parts[i]);
    }
}
1307
1308
/**
 * Characters that may appear in a hier_part production.
 * <p><blockquote><pre>
 * hier_part     = ( net_path | abs_path ) [ "?" query ]
 * </pre></blockquote><p>
 */
protected static final BitSet hier_part = new BitSet(256);

static {
    // Union of the operand sets; no separate bit is set here for '?'.
    final BitSet[] parts = {net_path, abs_path, query};
    for (int i = 0; i < parts.length; i++) {
        hier_part.or(parts[i]);
    }
}
1323
1324
/**
 * Characters that may appear in a relativeURI production.
 * <p><blockquote><pre>
 * relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
 * </pre></blockquote><p>
 */
protected static final BitSet relativeURI = new BitSet(256);

static {
    // Union of the operand sets; no separate bit is set here for '?'.
    final BitSet[] parts = {net_path, abs_path, rel_path, query};
    for (int i = 0; i < parts.length; i++) {
        relativeURI.or(parts[i]);
    }
}
1340
1341
/**
 * Characters that may appear in an absoluteURI production.
 * <p><blockquote><pre>
 * absoluteURI   = scheme ":" ( hier_part | opaque_part )
 * </pre></blockquote><p>
 */
protected static final BitSet absoluteURI = new BitSet(256);

static {
    // The ':' separator plus the union of every operand set.
    absoluteURI.set(':');
    final BitSet[] parts = {scheme, hier_part, opaque_part};
    for (int i = 0; i < parts.length; i++) {
        absoluteURI.or(parts[i]);
    }
}
1356
1357
/**
 * Characters that may appear in a URI-reference production.
 * <p><blockquote><pre>
 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 * </pre></blockquote><p>
 */
protected static final BitSet URI_reference = new BitSet(256);

static {
    // The '#' separator plus the union of every operand set.
    URI_reference.set('#');
    final BitSet[] parts = {absoluteURI, relativeURI, fragment};
    for (int i = 0; i < parts.length; i++) {
        URI_reference.or(parts[i]);
    }
}
1372
1373
1374
1375
/**
 * BitSet of the control characters: the codes 0x00 through 0x1F
 * inclusive, and 0x7F.
 */
public static final BitSet control = new BitSet(256);

static {
    int code = 0;
    while (code <= 0x1F) {
        control.set(code);
        code++;
    }
    control.set(0x7F);
}
1387
/**
 * BitSet containing only the space character (0x20).
 */
public static final BitSet space = new BitSet(256);

static {
    space.set(' ');
}
1396
1397
/**
 * BitSet of the delimiter characters: '&lt;', '&gt;', '#', '%' and the
 * double quote.
 */
public static final BitSet delims = new BitSet(256);

static {
    final char[] marks = {'<', '>', '#', '%', '"'};
    for (int i = 0; i < marks.length; i++) {
        delims.set(marks[i]);
    }
}
1410
1411
/**
 * BitSet of the "unwise" characters of RFC 2396.
 * <p><blockquote><pre>
 * unwise        = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
 * </pre></blockquote><p>
 */
public static final BitSet unwise = new BitSet(256);

static {
    unwise.set('{');
    unwise.set('}');
    unwise.set('|');
    // The backslash must be written as an escaped char literal.
    unwise.set('\\');
    unwise.set('^');
    unwise.set('[');
    unwise.set(']');
    unwise.set('`');
}
1427
1428
/**
 * Characters disallowed in a rel_path before escaping: every uric
 * character that is not also a rel_path character.
 */
public static final BitSet disallowed_rel_path = new BitSet(256);

static {
    // Start from uric, then subtract the rel_path set.
    disallowed_rel_path.or(uric);
    disallowed_rel_path.andNot(rel_path);
}
1438
1439
/**
 * Characters disallowed in an opaque_part before escaping: every uric
 * character that is not also an opaque_part character.
 */
public static final BitSet disallowed_opaque_part = new BitSet(256);

static {
    // Start from uric, then subtract the opaque_part set.
    disallowed_opaque_part.or(uric);
    disallowed_opaque_part.andNot(opaque_part);
}
1449
1450
1451
/**
 * Characters allowed for the authority component: the authority set
 * without the percent character.
 */
public static final BitSet allowed_authority = new BitSet(256);

static {
    allowed_authority.or(authority);
    // '%' is removed from the allowed set.
    allowed_authority.clear('%');
}
1461
1462
/**
 * Characters allowed for the opaque_part: the opaque_part set without
 * the percent character.
 */
public static final BitSet allowed_opaque_part = new BitSet(256);

static {
    allowed_opaque_part.or(opaque_part);
    // '%' is removed from the allowed set.
    allowed_opaque_part.clear('%');
}
1472
1473
/**
 * Characters allowed for the reg_name: the reg_name set without the
 * percent character.
 */
public static final BitSet allowed_reg_name = new BitSet(256);

static {
    allowed_reg_name.or(reg_name);
    // '%' is removed from the allowed set.
    allowed_reg_name.clear('%');
}
1484
1485
/**
 * Characters allowed for the userinfo component: the userinfo set
 * without the percent character.
 */
public static final BitSet allowed_userinfo = new BitSet(256);

static {
    allowed_userinfo.or(userinfo);
    // '%' is removed from the allowed set.
    allowed_userinfo.clear('%');
}
1496
1497
/**
 * Characters allowed within a piece of the userinfo component: the
 * within_userinfo set without the percent character.
 */
public static final BitSet allowed_within_userinfo = new BitSet(256);

static {
    allowed_within_userinfo.or(within_userinfo);
    // '%' is removed from the allowed set.
    allowed_within_userinfo.clear('%');
}
1507
1508
/**
 * Characters allowed for the IPv6reference component: the
 * IPv6reference set with the enclosing brackets '[' and ']' excluded.
 */
public static final BitSet allowed_IPv6reference = new BitSet(256);

static {
    allowed_IPv6reference.or(IPv6reference);
    // Drop both bracket characters.
    final char[] brackets = {'[', ']'};
    for (int i = 0; i < brackets.length; i++) {
        allowed_IPv6reference.clear(brackets[i]);
    }
}
1521
1522
/**
 * Characters allowed for the host component: the union of the hostname
 * set and the allowed IPv6reference set (which excludes '[' and ']').
 */
public static final BitSet allowed_host = new BitSet(256);

static {
    // Plain union; the order of the operands is irrelevant.
    allowed_host.or(allowed_IPv6reference);
    allowed_host.or(hostname);
}
1533
1534
/**
 * Characters allowed within the authority component: the union of the
 * server and reg_name sets with ';', ':', '@', '?' and '/' removed.
 */
public static final BitSet allowed_within_authority = new BitSet(256);

static {
    allowed_within_authority.or(server);
    allowed_within_authority.or(reg_name);
    // The removals must follow the unions above.
    final String excluded = ";:@?/";
    for (int i = 0; i < excluded.length(); i++) {
        allowed_within_authority.clear(excluded.charAt(i));
    }
}
1549
1550
/**
 * Characters allowed for the abs_path: the abs_path set minus the
 * members of the percent set and minus the plus sign.
 */
public static final BitSet allowed_abs_path = new BitSet(256);

static {
    allowed_abs_path.or(abs_path);
    // Subtract after the union so the removals take effect.
    allowed_abs_path.andNot(percent);
    allowed_abs_path.clear('+');
}
1562
1563
/**
 * Characters allowed for the rel_path: the rel_path set without the
 * percent character and the plus sign.
 */
public static final BitSet allowed_rel_path = new BitSet(256);

static {
    allowed_rel_path.or(rel_path);
    // Remove '%' and '+' once the base set is in place.
    final String excluded = "%+";
    for (int i = 0; i < excluded.length(); i++) {
        allowed_rel_path.clear(excluded.charAt(i));
    }
}
1574
1575
/**
 * Characters allowed within the path: the abs_path set with '/', ';',
 * '=' and '?' removed.
 */
public static final BitSet allowed_within_path = new BitSet(256);

static {
    allowed_within_path.or(abs_path);
    // Remove the four excluded characters once the base set is in place.
    final String excluded = "/;=?";
    for (int i = 0; i < excluded.length(); i++) {
        allowed_within_path.clear(excluded.charAt(i));
    }
}
1588
1589
/**
 * Characters allowed for the query component: the uric set without the
 * percent character.
 */
public static final BitSet allowed_query = new BitSet(256);

static {
    allowed_query.or(uric);
    // '%' is removed from the allowed set.
    allowed_query.clear('%');
}
1599
1600
/**
 * Characters allowed within the query component: the allowed query set
 * minus every reserved character.
 */
public static final BitSet allowed_within_query = new BitSet(256);

static {
    // Start from allowed_query, then subtract the reserved set.
    allowed_within_query.or(allowed_query);
    allowed_within_query.andNot(reserved);
}
1610
1611
/**
 * Characters allowed for the fragment component: the uric set without
 * the percent character.
 */
public static final BitSet allowed_fragment = new BitSet(256);

static {
    allowed_fragment.or(uric);
    // '%' is removed from the allowed set.
    allowed_fragment.clear('%');
}
1621
1622
1623
1624
1625
1626
1627
// Classification flags recorded while a URI reference is parsed and
// reported by the corresponding is...() accessors.

// Form of the scheme-specific part: hier_part or opaque_part.
protected boolean _is_hier_part;
protected boolean _is_opaque_part;

// Form of the path: net_path, abs_path or rel_path.
// parseUriReference resets these three (and _is_hier_part) before
// classifying the input.
protected boolean _is_net_path;
protected boolean _is_abs_path;
protected boolean _is_rel_path;

// Form of the authority: reg_name or server. Set by parseAuthority.
protected boolean _is_reg_name;
protected boolean _is_server;

// Form of the host: hostname, IPv4address or IPv6reference.
// Set by parseAuthority.
protected boolean _is_hostname;
protected boolean _is_IPv4address;
protected boolean _is_IPv6reference;
1644
1645
1646
1647 /***
1648 * Encodes URI string.
1649 *
1650 * This is a two mapping, one from original characters to octets, and
1651 * subsequently a second from octets to URI characters:
1652 * <p><blockquote><pre>
1653 * original character sequence->octet sequence->URI character sequence
1654 * </pre></blockquote><p>
1655 *
1656 * An escaped octet is encoded as a character triplet, consisting of the
1657 * percent character "%" followed by the two hexadecimal digits
1658 * representing the octet code. For example, "%20" is the escaped
1659 * encoding for the US-ASCII space character.
1660 * <p>
1661 * Conversion from the local filesystem character set to UTF-8 will
1662 * normally involve a two step process. First convert the local character
1663 * set to the UCS; then convert the UCS to UTF-8.
1664 * The first step in the process can be performed by maintaining a mapping
1665 * table that includes the local character set code and the corresponding
1666 * UCS code.
1667 * The next step is to convert the UCS character code to the UTF-8 encoding.
1668 * <p>
1669 * Mapping between vendor codepages can be done in a very similar manner
1670 * as described above.
1671 * <p>
1672 * The only time escape encodings can allowedly be made is when a URI is
1673 * being created from its component parts. The escape and validate methods
1674 * are internally performed within this method.
1675 *
1676 * @param original the original character sequence
1677 * @param allowed those characters that are allowed within a component
1678 * @param charset the protocol charset
1679 * @return URI character sequence
1680 * @throws URIException null component or unsupported character encoding
1681 */
1682
1683 protected static char[] encode(String original, BitSet allowed,
1684 String charset) throws URIException {
1685 if (original == null) {
1686 throw new IllegalArgumentException("Original string may not be null");
1687 }
1688 if (allowed == null) {
1689 throw new IllegalArgumentException("Allowed bitset may not be null");
1690 }
1691 byte[] rawdata = URLCodec.encodeUrl(allowed, EncodingUtil.getBytes(original, charset));
1692 return EncodingUtil.getAsciiString(rawdata).toCharArray();
1693 }
1694
1695 /***
1696 * Decodes URI encoded string.
1697 *
1698 * This is a two mapping, one from URI characters to octets, and
1699 * subsequently a second from octets to original characters:
1700 * <p><blockquote><pre>
1701 * URI character sequence->octet sequence->original character sequence
1702 * </pre></blockquote><p>
1703 *
1704 * A URI must be separated into its components before the escaped
1705 * characters within those components can be allowedly decoded.
1706 * <p>
1707 * Notice that there is a chance that URI characters that are non UTF-8
1708 * may be parsed as valid UTF-8. A recent non-scientific analysis found
1709 * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
1710 * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
1711 * false reading.
1712 * <p>
1713 * The percent "%" character always has the reserved purpose of being
1714 * the escape indicator, it must be escaped as "%25" in order to be used
1715 * as data within a URI.
1716 * <p>
1717 * The unescape method is internally performed within this method.
1718 *
1719 * @param component the URI character sequence
1720 * @param charset the protocol charset
1721 * @return original character sequence
1722 * @throws URIException incomplete trailing escape pattern or unsupported
1723 * character encoding
1724 */
1725 protected static String decode(char[] component, String charset)
1726 throws URIException {
1727 if (component == null) {
1728 throw new IllegalArgumentException("Component array of chars may not be null");
1729 }
1730 return decode(new String(component), charset);
1731 }
1732
1733 /***
1734 * Decodes URI encoded string.
1735 *
1736 * This is a two mapping, one from URI characters to octets, and
1737 * subsequently a second from octets to original characters:
1738 * <p><blockquote><pre>
1739 * URI character sequence->octet sequence->original character sequence
1740 * </pre></blockquote><p>
1741 *
1742 * A URI must be separated into its components before the escaped
1743 * characters within those components can be allowedly decoded.
1744 * <p>
1745 * Notice that there is a chance that URI characters that are non UTF-8
1746 * may be parsed as valid UTF-8. A recent non-scientific analysis found
1747 * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
1748 * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
1749 * false reading.
1750 * <p>
1751 * The percent "%" character always has the reserved purpose of being
1752 * the escape indicator, it must be escaped as "%25" in order to be used
1753 * as data within a URI.
1754 * <p>
1755 * The unescape method is internally performed within this method.
1756 *
1757 * @param component the URI character sequence
1758 * @param charset the protocol charset
1759 * @return original character sequence
1760 * @throws URIException incomplete trailing escape pattern or unsupported
1761 * character encoding
1762 *
1763 * @since 3.0
1764 */
1765 protected static String decode(String component, String charset)
1766 throws URIException {
1767 if (component == null) {
1768 throw new IllegalArgumentException("Component array of chars may not be null");
1769 }
1770 byte[] rawdata = null;
1771 try {
1772 rawdata = URLCodec.decodeUrl(EncodingUtil.getAsciiBytes(component));
1773 } catch (DecoderException e) {
1774 throw new URIException(e.getMessage());
1775 }
1776 return EncodingUtil.getString(rawdata, charset);
1777 }
1778 /***
1779 * Pre-validate the unescaped URI string within a specific component.
1780 *
1781 * @param component the component string within the component
1782 * @param disallowed those characters disallowed within the component
1783 * @return if true, it doesn't have the disallowed characters
1784 * if false, the component is undefined or an incorrect one
1785 */
1786 protected boolean prevalidate(String component, BitSet disallowed) {
1787
1788 if (component == null) {
1789 return false;
1790 }
1791 char[] target = component.toCharArray();
1792 for (int i = 0; i < target.length; i++) {
1793 if (disallowed.get(target[i])) {
1794 return false;
1795 }
1796 }
1797 return true;
1798 }
1799
1800
1801 /***
1802 * Validate the URI characters within a specific component.
1803 * The component must be performed after escape encoding. Or it doesn't
1804 * include escaped characters.
1805 *
1806 * @param component the characters sequence within the component
1807 * @param generous those characters that are allowed within a component
1808 * @return if true, it's the correct URI character sequence
1809 */
1810 protected boolean validate(char[] component, BitSet generous) {
1811
1812 return validate(component, 0, -1, generous);
1813 }
1814
1815
1816 /***
1817 * Validate the URI characters within a specific component.
1818 * The component must be performed after escape encoding. Or it doesn't
1819 * include escaped characters.
1820 * <p>
1821 * It's not that much strict, generous. The strict validation might be
1822 * performed before being called this method.
1823 *
1824 * @param component the characters sequence within the component
1825 * @param soffset the starting offset of the given component
1826 * @param eoffset the ending offset of the given component
1827 * if -1, it means the length of the component
1828 * @param generous those characters that are allowed within a component
1829 * @return if true, it's the correct URI character sequence
1830 */
1831 protected boolean validate(char[] component, int soffset, int eoffset,
1832 BitSet generous) {
1833
1834 if (eoffset == -1) {
1835 eoffset = component.length - 1;
1836 }
1837 for (int i = soffset; i <= eoffset; i++) {
1838 if (!generous.get(component[i])) {
1839 return false;
1840 }
1841 }
1842 return true;
1843 }
1844
1845
1846 /***
1847 * In order to avoid any possilbity of conflict with non-ASCII characters,
1848 * Parse a URI reference as a <code>String</code> with the character
1849 * encoding of the local system or the document.
1850 * <p>
1851 * The following line is the regular expression for breaking-down a URI
1852 * reference into its components.
1853 * <p><blockquote><pre>
1854 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1855 * 12 3 4 5 6 7 8 9
1856 * </pre></blockquote><p>
1857 * For example, matching the above expression to
1858 * http://jakarta.apache.org/ietf/uri/#Related
1859 * results in the following subexpression matches:
1860 * <p><blockquote><pre>
1861 * $1 = http:
1862 * scheme = $2 = http
1863 * $3 = //jakarta.apache.org
1864 * authority = $4 = jakarta.apache.org
1865 * path = $5 = /ietf/uri/
1866 * $6 = <undefined>
1867 * query = $7 = <undefined>
1868 * $8 = #Related
1869 * fragment = $9 = Related
1870 * </pre></blockquote><p>
1871 *
1872 * @param original the original character sequence
1873 * @param escaped <code>true</code> if <code>original</code> is escaped
1874 * @throws URIException If an error occurs.
1875 */
1876 protected void parseUriReference(String original, boolean escaped)
1877 throws URIException {
1878
1879
1880 if (original == null) {
1881 throw new URIException("URI-Reference required");
1882 }
1883
1884
1885
1886
1887 String tmp = original.trim();
1888
1889
1890
1891
1892
1893 int length = tmp.length();
1894
1895
1896
1897
1898 if (length > 0) {
1899 char[] firstDelimiter = { tmp.charAt(0) };
1900 if (validate(firstDelimiter, delims)) {
1901 if (length >= 2) {
1902 char[] lastDelimiter = { tmp.charAt(length - 1) };
1903 if (validate(lastDelimiter, delims)) {
1904 tmp = tmp.substring(1, length - 1);
1905 length = length - 2;
1906 }
1907 }
1908 }
1909 }
1910
1911
1912
1913
1914 int from = 0;
1915
1916
1917
1918
1919 boolean isStartedFromPath = false;
1920 int atColon = tmp.indexOf(':');
1921 int atSlash = tmp.indexOf('/');
1922 if ((atColon <= 0 && !tmp.startsWith("//"))
1923 || (atSlash >= 0 && atSlash < atColon)) {
1924 isStartedFromPath = true;
1925 }
1926
1927
1928
1929
1930
1931
1932
1933 int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from);
1934 if (at == -1) {
1935 at = 0;
1936 }
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946 if (at > 0 && at < length && tmp.charAt(at) == ':') {
1947 char[] target = tmp.substring(0, at).toLowerCase().toCharArray();
1948 if (validate(target, scheme)) {
1949 _scheme = target;
1950 } else {
1951 throw new URIException("incorrect scheme");
1952 }
1953 from = ++at;
1954 }
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965 _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false;
1966 if (0 <= at && at < length && tmp.charAt(at) == '/') {
1967
1968 _is_hier_part = true;
1969 if (at + 2 < length && tmp.charAt(at + 1) == '/'
1970 && !isStartedFromPath) {
1971
1972 int next = indexFirstOf(tmp, "/?#", at + 2);
1973 if (next == -1) {
1974 next = (tmp.substring(at + 2).length() == 0) ? at + 2
1975 : tmp.length();
1976 }
1977 parseAuthority(tmp.substring(at + 2, next), escaped);
1978 from = at = next;
1979
1980 _is_net_path = true;
1981 }
1982 if (from == at) {
1983
1984 _is_abs_path = true;
1985 }
1986 }
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996 if (from < length) {
1997
1998 int next = indexFirstOf(tmp, "?#", from);
1999 if (next == -1) {
2000 next = tmp.length();
2001 }
2002 if (!_is_abs_path) {
2003 if (!escaped
2004 && prevalidate(tmp.substring(from, next), disallowed_rel_path)
2005 || escaped
2006 && validate(tmp.substring(from, next).toCharArray(), rel_path)) {
2007
2008 _is_rel_path = true;
2009 } else if (!escaped
2010 && prevalidate(tmp.substring(from, next), disallowed_opaque_part)
2011 || escaped
2012 && validate(tmp.substring(from, next).toCharArray(), opaque_part)) {
2013
2014 _is_opaque_part = true;
2015 } else {
2016
2017 _path = null;
2018 }
2019 }
2020 String s = tmp.substring(from, next);
2021 if (escaped) {
2022 setRawPath(s.toCharArray());
2023 } else {
2024 setPath(s);
2025 }
2026 at = next;
2027 }
2028
2029
2030 String charset = getProtocolCharset();
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040 if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') {
2041 int next = tmp.indexOf('#', at + 1);
2042 if (next == -1) {
2043 next = tmp.length();
2044 }
2045 if (escaped) {
2046 _query = tmp.substring(at + 1, next).toCharArray();
2047 if (!validate(_query, uric)) {
2048 throw new URIException("Invalid query");
2049 }
2050 } else {
2051 _query = encode(tmp.substring(at + 1, next), allowed_query, charset);
2052 }
2053 at = next;
2054 }
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064 if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') {
2065 if (at + 1 == length) {
2066 _fragment = "".toCharArray();
2067 } else {
2068 _fragment = (escaped) ? tmp.substring(at + 1).toCharArray()
2069 : encode(tmp.substring(at + 1), allowed_fragment, charset);
2070 }
2071 }
2072
2073
2074 setURI();
2075 }
2076
2077
2078 /***
2079 * Get the earlier index that to be searched for the first occurrance in
2080 * one of any of the given string.
2081 *
2082 * @param s the string to be indexed
2083 * @param delims the delimiters used to index
2084 * @return the earlier index if there are delimiters
2085 */
2086 protected int indexFirstOf(String s, String delims) {
2087 return indexFirstOf(s, delims, -1);
2088 }
2089
2090
2091 /***
2092 * Get the earlier index that to be searched for the first occurrance in
2093 * one of any of the given string.
2094 *
2095 * @param s the string to be indexed
2096 * @param delims the delimiters used to index
2097 * @param offset the from index
2098 * @return the earlier index if there are delimiters
2099 */
2100 protected int indexFirstOf(String s, String delims, int offset) {
2101 if (s == null || s.length() == 0) {
2102 return -1;
2103 }
2104 if (delims == null || delims.length() == 0) {
2105 return -1;
2106 }
2107
2108 if (offset < 0) {
2109 offset = 0;
2110 } else if (offset > s.length()) {
2111 return -1;
2112 }
2113
2114 int min = s.length();
2115 char[] delim = delims.toCharArray();
2116 for (int i = 0; i < delim.length; i++) {
2117 int at = s.indexOf(delim[i], offset);
2118 if (at >= 0 && at < min) {
2119 min = at;
2120 }
2121 }
2122 return (min == s.length()) ? -1 : min;
2123 }
2124
2125
2126 /***
2127 * Get the earlier index that to be searched for the first occurrance in
2128 * one of any of the given array.
2129 *
2130 * @param s the character array to be indexed
2131 * @param delim the delimiter used to index
2132 * @return the ealier index if there are a delimiter
2133 */
2134 protected int indexFirstOf(char[] s, char delim) {
2135 return indexFirstOf(s, delim, 0);
2136 }
2137
2138
2139 /***
2140 * Get the earlier index that to be searched for the first occurrance in
2141 * one of any of the given array.
2142 *
2143 * @param s the character array to be indexed
2144 * @param delim the delimiter used to index
2145 * @param offset The offset.
2146 * @return the ealier index if there is a delimiter
2147 */
2148 protected int indexFirstOf(char[] s, char delim, int offset) {
2149 if (s == null || s.length == 0) {
2150 return -1;
2151 }
2152
2153 if (offset < 0) {
2154 offset = 0;
2155 } else if (offset > s.length) {
2156 return -1;
2157 }
2158 for (int i = offset; i < s.length; i++) {
2159 if (s[i] == delim) {
2160 return i;
2161 }
2162 }
2163 return -1;
2164 }
2165
2166
2167 /***
2168 * Parse the authority component.
2169 *
2170 * @param original the original character sequence of authority component
2171 * @param escaped <code>true</code> if <code>original</code> is escaped
2172 * @throws URIException If an error occurs.
2173 */
2174 protected void parseAuthority(String original, boolean escaped)
2175 throws URIException {
2176
2177
2178 _is_reg_name = _is_server =
2179 _is_hostname = _is_IPv4address = _is_IPv6reference = false;
2180
2181
2182 String charset = getProtocolCharset();
2183
2184 boolean hasPort = true;
2185 int from = 0;
2186 int next = original.indexOf('@');
2187 if (next != -1) {
2188
2189 _userinfo = (escaped) ? original.substring(0, next).toCharArray()
2190 : encode(original.substring(0, next), allowed_userinfo,
2191 charset);
2192 from = next + 1;
2193 }
2194 next = original.indexOf('[', from);
2195 if (next >= from) {
2196 next = original.indexOf(']', from);
2197 if (next == -1) {
2198 throw new URIException(URIException.PARSING, "IPv6reference");
2199 } else {
2200 next++;
2201 }
2202
2203 _host = (escaped) ? original.substring(from, next).toCharArray()
2204 : encode(original.substring(from, next), allowed_IPv6reference,
2205 charset);
2206
2207 _is_IPv6reference = true;
2208 } else {
2209 next = original.indexOf(':', from);
2210 if (next == -1) {
2211 next = original.length();
2212 hasPort = false;
2213 }
2214
2215 _host = original.substring(from, next).toCharArray();
2216 if (validate(_host, IPv4address)) {
2217
2218 _is_IPv4address = true;
2219 } else if (validate(_host, hostname)) {
2220
2221 _is_hostname = true;
2222 } else {
2223
2224 _is_reg_name = true;
2225 }
2226 }
2227 if (_is_reg_name) {
2228
2229 _is_server = _is_hostname = _is_IPv4address =
2230 _is_IPv6reference = false;
2231
2232 if (escaped) {
2233 _authority = original.toString().toCharArray();
2234 if (!validate(_authority, reg_name)) {
2235 throw new URIException("Invalid authority");
2236 }
2237 } else {
2238 _authority = encode(original.toString(), allowed_reg_name, charset);
2239 }
2240 } else {
2241 if (original.length() - 1 > next && hasPort
2242 && original.charAt(next) == ':') {
2243 from = next + 1;
2244 try {
2245 _port = Integer.parseInt(original.substring(from));
2246 } catch (NumberFormatException error) {
2247 throw new URIException(URIException.PARSING,
2248 "invalid port number");
2249 }
2250 }
2251
2252 StringBuffer buf = new StringBuffer();
2253 if (_userinfo != null) {
2254 buf.append(_userinfo);
2255 buf.append('@');
2256 }
2257 if (_host != null) {
2258 buf.append(_host);
2259 if (_port != -1) {
2260 buf.append(':');
2261 buf.append(_port);
2262 }
2263 }
2264 _authority = buf.toString().toCharArray();
2265
2266 _is_server = true;
2267 }
2268 }
2269
2270
2271 /***
2272 * Once it's parsed successfully, set this URI.
2273 *
2274 * @see #getRawURI
2275 */
2276 protected void setURI() {
2277
2278 StringBuffer buf = new StringBuffer();
2279
2280 if (_scheme != null) {
2281 buf.append(_scheme);
2282 buf.append(':');
2283 }
2284 if (_is_net_path) {
2285 buf.append("//");
2286 if (_authority != null) {
2287 buf.append(_authority);
2288 }
2289 }
2290 if (_opaque != null && _is_opaque_part) {
2291 buf.append(_opaque);
2292 } else if (_path != null) {
2293
2294 if (_path.length != 0) {
2295 buf.append(_path);
2296 }
2297 }
2298 if (_query != null) {
2299 buf.append('?');
2300 buf.append(_query);
2301 }
2302
2303 _uri = buf.toString().toCharArray();
2304 hash = 0;
2305 }
2306
2307
2308
2309
2310 /***
2311 * Tell whether or not this URI is absolute.
2312 *
2313 * @return true iif this URI is absoluteURI
2314 */
2315 public boolean isAbsoluteURI() {
2316 return (_scheme != null);
2317 }
2318
2319
2320 /***
2321 * Tell whether or not this URI is relative.
2322 *
2323 * @return true iif this URI is relativeURI
2324 */
2325 public boolean isRelativeURI() {
2326 return (_scheme == null);
2327 }
2328
2329
2330 /***
2331 * Tell whether or not the absoluteURI of this URI is hier_part.
2332 *
2333 * @return true iif the absoluteURI is hier_part
2334 */
2335 public boolean isHierPart() {
2336 return _is_hier_part;
2337 }
2338
2339
2340 /***
2341 * Tell whether or not the absoluteURI of this URI is opaque_part.
2342 *
2343 * @return true iif the absoluteURI is opaque_part
2344 */
2345 public boolean isOpaquePart() {
2346 return _is_opaque_part;
2347 }
2348
2349
2350 /***
2351 * Tell whether or not the relativeURI or heir_part of this URI is net_path.
2352 * It's the same function as the has_authority() method.
2353 *
2354 * @return true iif the relativeURI or heir_part is net_path
2355 * @see #hasAuthority
2356 */
2357 public boolean isNetPath() {
2358 return _is_net_path || (_authority != null);
2359 }
2360
2361
2362 /***
2363 * Tell whether or not the relativeURI or hier_part of this URI is abs_path.
2364 *
2365 * @return true iif the relativeURI or hier_part is abs_path
2366 */
2367 public boolean isAbsPath() {
2368 return _is_abs_path;
2369 }
2370
2371
2372 /***
2373 * Tell whether or not the relativeURI of this URI is rel_path.
2374 *
2375 * @return true iif the relativeURI is rel_path
2376 */
2377 public boolean isRelPath() {
2378 return _is_rel_path;
2379 }
2380
2381
2382 /***
2383 * Tell whether or not this URI has authority.
2384 * It's the same function as the is_net_path() method.
2385 *
2386 * @return true iif this URI has authority
2387 * @see #isNetPath
2388 */
2389 public boolean hasAuthority() {
2390 return (_authority != null) || _is_net_path;
2391 }
2392
2393 /***
2394 * Tell whether or not the authority component of this URI is reg_name.
2395 *
2396 * @return true iif the authority component is reg_name
2397 */
2398 public boolean isRegName() {
2399 return _is_reg_name;
2400 }
2401
2402
2403 /***
2404 * Tell whether or not the authority component of this URI is server.
2405 *
2406 * @return true iif the authority component is server
2407 */
2408 public boolean isServer() {
2409 return _is_server;
2410 }
2411
2412
2413 /***
2414 * Tell whether or not this URI has userinfo.
2415 *
2416 * @return true iif this URI has userinfo
2417 */
2418 public boolean hasUserinfo() {
2419 return (_userinfo != null);
2420 }
2421
2422
2423 /***
2424 * Tell whether or not the host part of this URI is hostname.
2425 *
2426 * @return true iif the host part is hostname
2427 */
2428 public boolean isHostname() {
2429 return _is_hostname;
2430 }
2431
2432
2433 /***
2434 * Tell whether or not the host part of this URI is IPv4address.
2435 *
2436 * @return true iif the host part is IPv4address
2437 */
2438 public boolean isIPv4address() {
2439 return _is_IPv4address;
2440 }
2441
2442
2443 /***
2444 * Tell whether or not the host part of this URI is IPv6reference.
2445 *
2446 * @return true iif the host part is IPv6reference
2447 */
2448 public boolean isIPv6reference() {
2449 return _is_IPv6reference;
2450 }
2451
2452
2453 /***
2454 * Tell whether or not this URI has query.
2455 *
2456 * @return true iif this URI has query
2457 */
2458 public boolean hasQuery() {
2459 return (_query != null);
2460 }
2461
2462
2463 /***
2464 * Tell whether or not this URI has fragment.
2465 *
2466 * @return true iif this URI has fragment
2467 */
2468 public boolean hasFragment() {
2469 return (_fragment != null);
2470 }
2471
2472
2473
2474
2475
2476 /***
2477 * Set the default charset of the protocol.
2478 * <p>
2479 * The character set used to store files SHALL remain a local decision and
2480 * MAY depend on the capability of local operating systems. Prior to the
2481 * exchange of URIs they SHOULD be converted into a ISO/IEC 10646 format
2482 * and UTF-8 encoded. This approach, while allowing international exchange
2483 * of URIs, will still allow backward compatibility with older systems
2484 * because the code set positions for ASCII characters are identical to the
2485 * one byte sequence in UTF-8.
2486 * <p>
2487 * An individual URI scheme may require a single charset, define a default
2488 * charset, or provide a way to indicate the charset used.
2489 *
2490 * <p>
2491 * Always all the time, the setter method is always succeeded and throws
2492 * <code>DefaultCharsetChanged</code> exception.
2493 *
2494 * So API programmer must follow the following way:
2495 * <code><pre>
2496 * import org.apache.util.URI$DefaultCharsetChanged;
2497 * .
2498 * .
2499 * .
2500 * try {
2501 * URI.setDefaultProtocolCharset("UTF-8");
2502 * } catch (DefaultCharsetChanged cc) {
2503 * // CASE 1: the exception could be ignored, when it is set by user
2504 * if (cc.getReasonCode() == DefaultCharsetChanged.PROTOCOL_CHARSET) {
2505 * // CASE 2: let user know the default protocol charset changed
2506 * } else {
2507 * // CASE 2: let user know the default document charset changed
2508 * }
2509 * }
2510 * </pre></code>
2511 *
2512 * The API programmer is responsible to set the correct charset.
2513 * And each application should remember its own charset to support.
2514 *
2515 * @param charset the default charset for each protocol
2516 * @throws DefaultCharsetChanged default charset changed
2517 */
2518 public static void setDefaultProtocolCharset(String charset)
2519 throws DefaultCharsetChanged {
2520
2521 defaultProtocolCharset = charset;
2522 throw new DefaultCharsetChanged(DefaultCharsetChanged.PROTOCOL_CHARSET,
2523 "the default protocol charset changed");
2524 }
2525
2526
2527 /***
2528 * Get the default charset of the protocol.
2529 * <p>
2530 * An individual URI scheme may require a single charset, define a default
2531 * charset, or provide a way to indicate the charset used.
2532 * <p>
2533 * To work globally either requires support of a number of character sets
2534 * and to be able to convert between them, or the use of a single preferred
2535 * character set.
2536 * For support of global compatibility it is STRONGLY RECOMMENDED that
2537 * clients and servers use UTF-8 encoding when exchanging URIs.
2538 *
2539 * @return the default charset string
2540 */
2541 public static String getDefaultProtocolCharset() {
2542 return defaultProtocolCharset;
2543 }
2544
2545
2546 /***
2547 * Get the protocol charset used by this current URI instance.
2548 * It was set by the constructor for this instance. If it was not set by
2549 * contructor, it will return the default protocol charset.
2550 *
2551 * @return the protocol charset string
2552 * @see #getDefaultProtocolCharset
2553 */
2554 public String getProtocolCharset() {
2555 return (protocolCharset != null)
2556 ? protocolCharset
2557 : defaultProtocolCharset;
2558 }
2559
2560
2561 /***
2562 * Set the default charset of the document.
2563 * <p>
2564 * Notice that it will be possible to contain mixed characters (e.g.
2565 * ftp://host/KoreanNamespace/ChineseResource). To handle the Bi-directional
2566 * display of these character sets, the protocol charset could be simply
2567 * used again. Because it's not yet implemented that the insertion of BIDI
2568 * control characters at different points during composition is extracted.
2569 * <p>
2570 *
2571 * Always all the time, the setter method is always succeeded and throws
2572 * <code>DefaultCharsetChanged</code> exception.
2573 *
2574 * So API programmer must follow the following way:
2575 * <code><pre>
2576 * import org.apache.util.URI$DefaultCharsetChanged;
2577 * .
2578 * .
2579 * .
2580 * try {
2581 * URI.setDefaultDocumentCharset("EUC-KR");
2582 * } catch (DefaultCharsetChanged cc) {
2583 * // CASE 1: the exception could be ignored, when it is set by user
2584 * if (cc.getReasonCode() == DefaultCharsetChanged.DOCUMENT_CHARSET) {
2585 * // CASE 2: let user know the default document charset changed
2586 * } else {
2587 * // CASE 2: let user know the default protocol charset changed
2588 * }
2589 * }
2590 * </pre></code>
2591 *
2592 * The API programmer is responsible to set the correct charset.
2593 * And each application should remember its own charset to support.
2594 *
2595 * @param charset the default charset for the document
2596 * @throws DefaultCharsetChanged default charset changed
2597 */
2598 public static void setDefaultDocumentCharset(String charset)
2599 throws DefaultCharsetChanged {
2600
2601 defaultDocumentCharset = charset;
2602 throw new DefaultCharsetChanged(DefaultCharsetChanged.DOCUMENT_CHARSET,
2603 "the default document charset changed");
2604 }
2605
2606
2607 /***
2608 * Get the recommended default charset of the document.
2609 *
2610 * @return the default charset string
2611 */
2612 public static String getDefaultDocumentCharset() {
2613 return defaultDocumentCharset;
2614 }
2615
2616
2617 /***
2618 * Get the default charset of the document by locale.
2619 *
2620 * @return the default charset string by locale
2621 */
2622 public static String getDefaultDocumentCharsetByLocale() {
2623 return defaultDocumentCharsetByLocale;
2624 }
2625
2626
2627 /***
2628 * Get the default charset of the document by platform.
2629 *
2630 * @return the default charset string by platform
2631 */
2632 public static String getDefaultDocumentCharsetByPlatform() {
2633 return defaultDocumentCharsetByPlatform;
2634 }
2635
2636
2637
2638 /***
2639 * Get the scheme.
2640 *
2641 * @return the scheme
2642 */
2643 public char[] getRawScheme() {
2644 return _scheme;
2645 }
2646
2647
2648 /***
2649 * Get the scheme.
2650 *
2651 * @return the scheme
2652 * null if undefined scheme
2653 */
2654 public String getScheme() {
2655 return (_scheme == null) ? null : new String(_scheme);
2656 }
2657
2658
2659
2660 /***
2661 * Set the authority. It can be one type of server, hostport, hostname,
2662 * IPv4address, IPv6reference and reg_name.
2663 * <p><blockquote><pre>
2664 * authority = server | reg_name
2665 * </pre></blockquote><p>
2666 *
2667 * @param escapedAuthority the raw escaped authority
2668 * @throws URIException If {@link
2669 * #parseAuthority(java.lang.String,boolean)} fails
2670 * @throws NullPointerException null authority
2671 */
2672 public void setRawAuthority(char[] escapedAuthority)
2673 throws URIException, NullPointerException {
2674
2675 parseAuthority(new String(escapedAuthority), true);
2676 setURI();
2677 }
2678
2679
2680 /***
2681 * Set the authority. It can be one type of server, hostport, hostname,
2682 * IPv4address, IPv6reference and reg_name.
2683 * Note that there is no setAuthority method by the escape encoding reason.
2684 *
2685 * @param escapedAuthority the escaped authority string
2686 * @throws URIException If {@link
2687 * #parseAuthority(java.lang.String,boolean)} fails
2688 */
2689 public void setEscapedAuthority(String escapedAuthority)
2690 throws URIException {
2691
2692 parseAuthority(escapedAuthority, true);
2693 setURI();
2694 }
2695
2696
2697 /***
2698 * Get the raw-escaped authority.
2699 *
2700 * @return the raw-escaped authority
2701 */
2702 public char[] getRawAuthority() {
2703 return _authority;
2704 }
2705
2706
2707 /***
2708 * Get the escaped authority.
2709 *
2710 * @return the escaped authority
2711 */
2712 public String getEscapedAuthority() {
2713 return (_authority == null) ? null : new String(_authority);
2714 }
2715
2716
2717 /***
2718 * Get the authority.
2719 *
2720 * @return the authority
2721 * @throws URIException If {@link #decode} fails
2722 */
2723 public String getAuthority() throws URIException {
2724 return (_authority == null) ? null : decode(_authority,
2725 getProtocolCharset());
2726 }
2727
2728
2729
2730 /***
2731 * Get the raw-escaped userinfo.
2732 *
2733 * @return the raw-escaped userinfo
2734 * @see #getAuthority
2735 */
2736 public char[] getRawUserinfo() {
2737 return _userinfo;
2738 }
2739
2740
2741 /***
2742 * Get the escaped userinfo.
2743 *
2744 * @return the escaped userinfo
2745 * @see #getAuthority
2746 */
2747 public String getEscapedUserinfo() {
2748 return (_userinfo == null) ? null : new String(_userinfo);
2749 }
2750
2751
2752 /***
2753 * Get the userinfo.
2754 *
2755 * @return the userinfo
2756 * @throws URIException If {@link #decode} fails
2757 * @see #getAuthority
2758 */
2759 public String getUserinfo() throws URIException {
2760 return (_userinfo == null) ? null : decode(_userinfo,
2761 getProtocolCharset());
2762 }
2763
2764
2765
2766 /***
2767 * Get the host.
2768 * <p><blockquote><pre>
2769 * host = hostname | IPv4address | IPv6reference
2770 * </pre></blockquote><p>
2771 *
2772 * @return the host
2773 * @see #getAuthority
2774 */
2775 public char[] getRawHost() {
2776 return _host;
2777 }
2778
2779
2780 /***
2781 * Get the host.
2782 * <p><blockquote><pre>
2783 * host = hostname | IPv4address | IPv6reference
2784 * </pre></blockquote><p>
2785 *
2786 * @return the host
2787 * @throws URIException If {@link #decode} fails
2788 * @see #getAuthority
2789 */
2790 public String getHost() throws URIException {
2791 if (_host != null) {
2792 return decode(_host, getProtocolCharset());
2793 } else {
2794 return null;
2795 }
2796 }
2797
2798
2799
2800 /***
2801 * Get the port. In order to get the specfic default port, the specific
2802 * protocol-supported class extended from the URI class should be used.
2803 * It has the server-based naming authority.
2804 *
2805 * @return the port
2806 * if -1, it has the default port for the scheme or the server-based
2807 * naming authority is not supported in the specific URI.
2808 */
2809 public int getPort() {
2810 return _port;
2811 }
2812
2813
2814
2815 /***
2816 * Set the raw-escaped path.
2817 *
2818 * @param escapedPath the path character sequence
2819 * @throws URIException encoding error or not proper for initial instance
2820 * @see #encode
2821 */
2822 public void setRawPath(char[] escapedPath) throws URIException {
2823 if (escapedPath == null || escapedPath.length == 0) {
2824 _path = _opaque = escapedPath;
2825 setURI();
2826 return;
2827 }
2828
2829 escapedPath = removeFragmentIdentifier(escapedPath);
2830 if (_is_net_path || _is_abs_path) {
2831 if (escapedPath[0] != '/') {
2832 throw new URIException(URIException.PARSING,
2833 "not absolute path");
2834 }
2835 if (!validate(escapedPath, abs_path)) {
2836 throw new URIException(URIException.ESCAPING,
2837 "escaped absolute path not valid");
2838 }
2839 _path = escapedPath;
2840 } else if (_is_rel_path) {
2841 int at = indexFirstOf(escapedPath, '/');
2842 if (at == 0) {
2843 throw new URIException(URIException.PARSING, "incorrect path");
2844 }
2845 if (at > 0 && !validate(escapedPath, 0, at - 1, rel_segment)
2846 && !validate(escapedPath, at, -1, abs_path)
2847 || at < 0 && !validate(escapedPath, 0, -1, rel_segment)) {
2848
2849 throw new URIException(URIException.ESCAPING,
2850 "escaped relative path not valid");
2851 }
2852 _path = escapedPath;
2853 } else if (_is_opaque_part) {
2854 if (!uric_no_slash.get(escapedPath[0])
2855 && !validate(escapedPath, 1, -1, uric)) {
2856 throw new URIException(URIException.ESCAPING,
2857 "escaped opaque part not valid");
2858 }
2859 _opaque = escapedPath;
2860 } else {
2861 throw new URIException(URIException.PARSING, "incorrect path");
2862 }
2863 setURI();
2864 }
2865
2866
2867 /***
2868 * Set the escaped path.
2869 *
2870 * @param escapedPath the escaped path string
2871 * @throws URIException encoding error or not proper for initial instance
2872 * @see #encode
2873 */
2874 public void setEscapedPath(String escapedPath) throws URIException {
2875 if (escapedPath == null) {
2876 _path = _opaque = null;
2877 setURI();
2878 return;
2879 }
2880 setRawPath(escapedPath.toCharArray());
2881 }
2882
2883
2884 /***
2885 * Set the path.
2886 *
2887 * @param path the path string
2888 * @throws URIException set incorrectly or fragment only
2889 * @see #encode
2890 */
2891 public void setPath(String path) throws URIException {
2892
2893 if (path == null || path.length() == 0) {
2894 _path = _opaque = (path == null) ? null : path.toCharArray();
2895 setURI();
2896 return;
2897 }
2898
2899 String charset = getProtocolCharset();
2900
2901 if (_is_net_path || _is_abs_path) {
2902 _path = encode(path, allowed_abs_path, charset);
2903 } else if (_is_rel_path) {
2904 StringBuffer buff = new StringBuffer(path.length());
2905 int at = path.indexOf('/');
2906 if (at == 0) {
2907 throw new URIException(URIException.PARSING,
2908 "incorrect relative path");
2909 }
2910 if (at > 0) {
2911 buff.append(encode(path.substring(0, at), allowed_rel_path,
2912 charset));
2913 buff.append(encode(path.substring(at), allowed_abs_path,
2914 charset));
2915 } else {
2916 buff.append(encode(path, allowed_rel_path, charset));
2917 }
2918 _path = buff.toString().toCharArray();
2919 } else if (_is_opaque_part) {
2920 StringBuffer buf = new StringBuffer();
2921 buf.insert(0, encode(path.substring(0, 1), uric_no_slash, charset));
2922 buf.insert(1, encode(path.substring(1), uric, charset));
2923 _opaque = buf.toString().toCharArray();
2924 } else {
2925 throw new URIException(URIException.PARSING, "incorrect path");
2926 }
2927 setURI();
2928 }
2929
2930
2931 /***
2932 * Resolve the base and relative path.
2933 *
2934 * @param basePath a character array of the basePath
2935 * @param relPath a character array of the relPath
2936 * @return the resolved path
2937 * @throws URIException no more higher path level to be resolved
2938 */
2939 protected char[] resolvePath(char[] basePath, char[] relPath)
2940 throws URIException {
2941
2942
2943 String base = (basePath == null) ? "" : new String(basePath);
2944 int at = base.lastIndexOf('/');
2945 if (at != -1) {
2946 basePath = base.substring(0, at + 1).toCharArray();
2947 }
2948
2949 if (relPath == null || relPath.length == 0) {
2950 return normalize(basePath);
2951 } else if (relPath[0] == '/') {
2952 return normalize(relPath);
2953 } else {
2954 StringBuffer buff = new StringBuffer(base.length()
2955 + relPath.length);
2956 buff.append((at != -1) ? base.substring(0, at + 1) : "/");
2957 buff.append(relPath);
2958 return normalize(buff.toString().toCharArray());
2959 }
2960 }
2961
2962
2963 /***
2964 * Get the raw-escaped current hierarchy level in the given path.
2965 * If the last namespace is a collection, the slash mark ('/') should be
2966 * ended with at the last character of the path string.
2967 *
2968 * @param path the path
2969 * @return the current hierarchy level
2970 * @throws URIException no hierarchy level
2971 */
2972 protected char[] getRawCurrentHierPath(char[] path) throws URIException {
2973
2974 if (_is_opaque_part) {
2975 throw new URIException(URIException.PARSING, "no hierarchy level");
2976 }
2977 if (path == null) {
2978 throw new URIException(URIException.PARSING, "empty path");
2979 }
2980 String buff = new String(path);
2981 int first = buff.indexOf('/');
2982 int last = buff.lastIndexOf('/');
2983 if (last == 0) {
2984 return rootPath;
2985 } else if (first != last && last != -1) {
2986 return buff.substring(0, last).toCharArray();
2987 }
2988
2989 return path;
2990 }
2991
2992
2993 /***
2994 * Get the raw-escaped current hierarchy level.
2995 *
2996 * @return the raw-escaped current hierarchy level
2997 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2998 */
2999 public char[] getRawCurrentHierPath() throws URIException {
3000 return (_path == null) ? null : getRawCurrentHierPath(_path);
3001 }
3002
3003
3004 /***
3005 * Get the escaped current hierarchy level.
3006 *
3007 * @return the escaped current hierarchy level
3008 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3009 */
3010 public String getEscapedCurrentHierPath() throws URIException {
3011 char[] path = getRawCurrentHierPath();
3012 return (path == null) ? null : new String(path);
3013 }
3014
3015
3016 /***
3017 * Get the current hierarchy level.
3018 *
3019 * @return the current hierarchy level
3020 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3021 * @see #decode
3022 */
3023 public String getCurrentHierPath() throws URIException {
3024 char[] path = getRawCurrentHierPath();
3025 return (path == null) ? null : decode(path, getProtocolCharset());
3026 }
3027
3028
3029 /***
3030 * Get the level above the this hierarchy level.
3031 *
3032 * @return the raw above hierarchy level
3033 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3034 */
3035 public char[] getRawAboveHierPath() throws URIException {
3036 char[] path = getRawCurrentHierPath();
3037 return (path == null) ? null : getRawCurrentHierPath(path);
3038 }
3039
3040
3041 /***
3042 * Get the level above the this hierarchy level.
3043 *
3044 * @return the raw above hierarchy level
3045 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3046 */
3047 public String getEscapedAboveHierPath() throws URIException {
3048 char[] path = getRawAboveHierPath();
3049 return (path == null) ? null : new String(path);
3050 }
3051
3052
3053 /***
3054 * Get the level above the this hierarchy level.
3055 *
3056 * @return the above hierarchy level
3057 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3058 * @see #decode
3059 */
3060 public String getAboveHierPath() throws URIException {
3061 char[] path = getRawAboveHierPath();
3062 return (path == null) ? null : decode(path, getProtocolCharset());
3063 }
3064
3065
3066 /***
3067 * Get the raw-escaped path.
3068 * <p><blockquote><pre>
3069 * path = [ abs_path | opaque_part ]
3070 * </pre></blockquote><p>
3071 *
3072 * @return the raw-escaped path
3073 */
3074 public char[] getRawPath() {
3075 return _is_opaque_part ? _opaque : _path;
3076 }
3077
3078
3079 /***
3080 * Get the escaped path.
3081 * <p><blockquote><pre>
3082 * path = [ abs_path | opaque_part ]
3083 * abs_path = "/" path_segments
3084 * opaque_part = uric_no_slash *uric
3085 * </pre></blockquote><p>
3086 *
3087 * @return the escaped path string
3088 */
3089 public String getEscapedPath() {
3090 char[] path = getRawPath();
3091 return (path == null) ? null : new String(path);
3092 }
3093
3094
3095 /***
3096 * Get the path.
3097 * <p><blockquote><pre>
3098 * path = [ abs_path | opaque_part ]
3099 * </pre></blockquote><p>
3100 * @return the path string
3101 * @throws URIException If {@link #decode} fails.
3102 * @see #decode
3103 */
3104 public String getPath() throws URIException {
3105 char[] path = getRawPath();
3106 return (path == null) ? null : decode(path, getProtocolCharset());
3107 }
3108
3109
3110 /***
3111 * Get the raw-escaped basename of the path.
3112 *
3113 * @return the raw-escaped basename
3114 */
3115 public char[] getRawName() {
3116 if (_path == null) {
3117 return null;
3118 }
3119
3120 int at = 0;
3121 for (int i = _path.length - 1; i >= 0; i--) {
3122 if (_path[i] == '/') {
3123 at = i + 1;
3124 break;
3125 }
3126 }
3127 int len = _path.length - at;
3128 char[] basename = new char[len];
3129 System.arraycopy(_path, at, basename, 0, len);
3130 return basename;
3131 }
3132
3133
3134 /***
3135 * Get the escaped basename of the path.
3136 *
3137 * @return the escaped basename string
3138 */
3139 public String getEscapedName() {
3140 char[] basename = getRawName();
3141 return (basename == null) ? null : new String(basename);
3142 }
3143
3144
3145 /***
3146 * Get the basename of the path.
3147 *
3148 * @return the basename string
3149 * @throws URIException incomplete trailing escape pattern or unsupported
3150 * character encoding
3151 * @see #decode
3152 */
3153 public String getName() throws URIException {
3154 char[] basename = getRawName();
3155 return (basename == null) ? null : decode(getRawName(),
3156 getProtocolCharset());
3157 }
3158
3159
3160
3161 /***
3162 * Get the raw-escaped path and query.
3163 *
3164 * @return the raw-escaped path and query
3165 */
3166 public char[] getRawPathQuery() {
3167
3168 if (_path == null && _query == null) {
3169 return null;
3170 }
3171 StringBuffer buff = new StringBuffer();
3172 if (_path != null) {
3173 buff.append(_path);
3174 }
3175 if (_query != null) {
3176 buff.append('?');
3177 buff.append(_query);
3178 }
3179 return buff.toString().toCharArray();
3180 }
3181
3182
3183 /***
3184 * Get the escaped query.
3185 *
3186 * @return the escaped path and query string
3187 */
3188 public String getEscapedPathQuery() {
3189 char[] rawPathQuery = getRawPathQuery();
3190 return (rawPathQuery == null) ? null : new String(rawPathQuery);
3191 }
3192
3193
3194 /***
3195 * Get the path and query.
3196 *
3197 * @return the path and query string.
3198 * @throws URIException incomplete trailing escape pattern or unsupported
3199 * character encoding
3200 * @see #decode
3201 */
3202 public String getPathQuery() throws URIException {
3203 char[] rawPathQuery = getRawPathQuery();
3204 return (rawPathQuery == null) ? null : decode(rawPathQuery,
3205 getProtocolCharset());
3206 }
3207
3208
3209
3210 /***
3211 * Set the raw-escaped query.
3212 *
3213 * @param escapedQuery the raw-escaped query
3214 * @throws URIException escaped query not valid
3215 */
3216 public void setRawQuery(char[] escapedQuery) throws URIException {
3217 if (escapedQuery == null || escapedQuery.length == 0) {
3218 _query = escapedQuery;
3219 setURI();
3220 return;
3221 }
3222
3223 escapedQuery = removeFragmentIdentifier(escapedQuery);
3224 if (!validate(escapedQuery, query)) {
3225 throw new URIException(URIException.ESCAPING,
3226 "escaped query not valid");
3227 }
3228 _query = escapedQuery;
3229 setURI();
3230 }
3231
3232
3233 /***
3234 * Set the escaped query string.
3235 *
3236 * @param escapedQuery the escaped query string
3237 * @throws URIException escaped query not valid
3238 */
3239 public void setEscapedQuery(String escapedQuery) throws URIException {
3240 if (escapedQuery == null) {
3241 _query = null;
3242 setURI();
3243 return;
3244 }
3245 setRawQuery(escapedQuery.toCharArray());
3246 }
3247
3248
3249 /***
3250 * Set the query.
3251 * <p>
3252 * When a query string is not misunderstood the reserved special characters
3253 * ("&", "=", "+", ",", and "$") within a query component, it is
3254 * recommended to use in encoding the whole query with this method.
3255 * <p>
3256 * The additional APIs for the special purpose using by the reserved
3257 * special characters used in each protocol are implemented in each protocol
3258 * classes inherited from <code>URI</code>. So refer to the same-named APIs
3259 * implemented in each specific protocol instance.
3260 *
3261 * @param query the query string.
3262 * @throws URIException incomplete trailing escape pattern or unsupported
3263 * character encoding
3264 * @see #encode
3265 */
3266 public void setQuery(String query) throws URIException {
3267 if (query == null || query.length() == 0) {
3268 _query = (query == null) ? null : query.toCharArray();
3269 setURI();
3270 return;
3271 }
3272 setRawQuery(encode(query, allowed_query, getProtocolCharset()));
3273 }
3274
3275
3276 /***
3277 * Get the raw-escaped query.
3278 *
3279 * @return the raw-escaped query
3280 */
3281 public char[] getRawQuery() {
3282 return _query;
3283 }
3284
3285
3286 /***
3287 * Get the escaped query.
3288 *
3289 * @return the escaped query string
3290 */
3291 public String getEscapedQuery() {
3292 return (_query == null) ? null : new String(_query);
3293 }
3294
3295
3296 /***
3297 * Get the query.
3298 *
3299 * @return the query string.
3300 * @throws URIException incomplete trailing escape pattern or unsupported
3301 * character encoding
3302 * @see #decode
3303 */
3304 public String getQuery() throws URIException {
3305 return (_query == null) ? null : decode(_query, getProtocolCharset());
3306 }
3307
3308
3309
3310 /***
3311 * Set the raw-escaped fragment.
3312 *
3313 * @param escapedFragment the raw-escaped fragment
3314 * @throws URIException escaped fragment not valid
3315 */
3316 public void setRawFragment(char[] escapedFragment) throws URIException {
3317 if (escapedFragment == null || escapedFragment.length == 0) {
3318 _fragment = escapedFragment;
3319 hash = 0;
3320 return;
3321 }
3322 if (!validate(escapedFragment, fragment)) {
3323 throw new URIException(URIException.ESCAPING,
3324 "escaped fragment not valid");
3325 }
3326 _fragment = escapedFragment;
3327 hash = 0;
3328 }
3329
3330
3331 /***
3332 * Set the escaped fragment string.
3333 *
3334 * @param escapedFragment the escaped fragment string
3335 * @throws URIException escaped fragment not valid
3336 */
3337 public void setEscapedFragment(String escapedFragment) throws URIException {
3338 if (escapedFragment == null) {
3339 _fragment = null;
3340 hash = 0;
3341 return;
3342 }
3343 setRawFragment(escapedFragment.toCharArray());
3344 }
3345
3346
3347 /***
3348 * Set the fragment.
3349 *
3350 * @param fragment the fragment string.
3351 * @throws URIException If an error occurs.
3352 */
3353 public void setFragment(String fragment) throws URIException {
3354 if (fragment == null || fragment.length() == 0) {
3355 _fragment = (fragment == null) ? null : fragment.toCharArray();
3356 hash = 0;
3357 return;
3358 }
3359 _fragment = encode(fragment, allowed_fragment, getProtocolCharset());
3360 hash = 0;
3361 }
3362
3363
3364 /***
3365 * Get the raw-escaped fragment.
3366 * <p>
3367 * The optional fragment identifier is not part of a URI, but is often used
3368 * in conjunction with a URI.
3369 * <p>
3370 * The format and interpretation of fragment identifiers is dependent on
3371 * the media type [RFC2046] of the retrieval result.
3372 * <p>
3373 * A fragment identifier is only meaningful when a URI reference is
3374 * intended for retrieval and the result of that retrieval is a document
3375 * for which the identified fragment is consistently defined.
3376 *
3377 * @return the raw-escaped fragment
3378 */
3379 public char[] getRawFragment() {
3380 return _fragment;
3381 }
3382
3383
3384 /***
3385 * Get the escaped fragment.
3386 *
3387 * @return the escaped fragment string
3388 */
3389 public String getEscapedFragment() {
3390 return (_fragment == null) ? null : new String(_fragment);
3391 }
3392
3393
3394 /***
3395 * Get the fragment.
3396 *
3397 * @return the fragment string
3398 * @throws URIException incomplete trailing escape pattern or unsupported
3399 * character encoding
3400 * @see #decode
3401 */
3402 public String getFragment() throws URIException {
3403 return (_fragment == null) ? null : decode(_fragment,
3404 getProtocolCharset());
3405 }
3406
3407
3408
3409 /***
3410 * Remove the fragment identifier of the given component.
3411 *
3412 * @param component the component that a fragment may be included
3413 * @return the component that the fragment identifier is removed
3414 */
3415 protected char[] removeFragmentIdentifier(char[] component) {
3416 if (component == null) {
3417 return null;
3418 }
3419 int lastIndex = new String(component).indexOf('#');
3420 if (lastIndex != -1) {
3421 component = new String(component).substring(0,
3422 lastIndex).toCharArray();
3423 }
3424 return component;
3425 }
3426
3427
3428 /***
3429 * Normalize the given hier path part.
3430 *
3431 * <p>Algorithm taken from URI reference parser at
3432 * http://www.apache.org/~fielding/uri/rev-2002/issues.html.
3433 *
3434 * @param path the path to normalize
3435 * @return the normalized path
3436 * @throws URIException no more higher path level to be normalized
3437 */
3438 protected char[] normalize(char[] path) throws URIException {
3439
3440 if (path == null) {
3441 return null;
3442 }
3443
3444 String normalized = new String(path);
3445
3446
3447 if (normalized.startsWith("./")) {
3448 normalized = normalized.substring(1);
3449 } else if (normalized.startsWith("../")) {
3450 normalized = normalized.substring(2);
3451 } else if (normalized.startsWith("..")) {
3452 normalized = normalized.substring(2);
3453 }
3454
3455
3456 int index = -1;
3457 while ((index = normalized.indexOf("/./")) != -1) {
3458 normalized = normalized.substring(0, index) + normalized.substring(index + 2);
3459 }
3460
3461
3462 if (normalized.endsWith("/.")) {
3463 normalized = normalized.substring(0, normalized.length() - 1);
3464 }
3465
3466 int startIndex = 0;
3467
3468
3469
3470
3471
3472
3473 while ((index = normalized.indexOf("/../", startIndex)) != -1) {
3474 int slashIndex = normalized.lastIndexOf('/', index - 1);
3475 if (slashIndex >= 0) {
3476 normalized = normalized.substring(0, slashIndex) + normalized.substring(index + 3);
3477 } else {
3478 startIndex = index + 3;
3479 }
3480 }
3481 if (normalized.endsWith("/..")) {
3482 int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4);
3483 if (slashIndex >= 0) {
3484 normalized = normalized.substring(0, slashIndex + 1);
3485 }
3486 }
3487
3488
3489
3490
3491
3492
3493 while ((index = normalized.indexOf("/../")) != -1) {
3494 int slashIndex = normalized.lastIndexOf('/', index - 1);
3495 if (slashIndex >= 0) {
3496 break;
3497 } else {
3498 normalized = normalized.substring(index + 3);
3499 }
3500 }
3501 if (normalized.endsWith("/..")) {
3502 int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4);
3503 if (slashIndex < 0) {
3504 normalized = "/";
3505 }
3506 }
3507
3508 return normalized.toCharArray();
3509 }
3510
3511
3512 /***
3513 * Normalizes the path part of this URI. Normalization is only meant to be performed on
3514 * URIs with an absolute path. Calling this method on a relative path URI will have no
3515 * effect.
3516 *
3517 * @throws URIException no more higher path level to be normalized
3518 *
3519 * @see #isAbsPath()
3520 */
3521 public void normalize() throws URIException {
3522 if (isAbsPath()) {
3523 _path = normalize(_path);
3524 setURI();
3525 }
3526 }
3527
3528
3529 /***
3530 * Test if the first array is equal to the second array.
3531 *
3532 * @param first the first character array
3533 * @param second the second character array
3534 * @return true if they're equal
3535 */
3536 protected boolean equals(char[] first, char[] second) {
3537
3538 if (first == null && second == null) {
3539 return true;
3540 }
3541 if (first == null || second == null) {
3542 return false;
3543 }
3544 if (first.length != second.length) {
3545 return false;
3546 }
3547 for (int i = 0; i < first.length; i++) {
3548 if (first[i] != second[i]) {
3549 return false;
3550 }
3551 }
3552 return true;
3553 }
3554
3555
3556 /***
3557 * Test an object if this URI is equal to another.
3558 *
3559 * @param obj an object to compare
3560 * @return true if two URI objects are equal
3561 */
3562 public boolean equals(Object obj) {
3563
3564
3565 if (obj == this) {
3566 return true;
3567 }
3568 if (!(obj instanceof URI)) {
3569 return false;
3570 }
3571 URI another = (URI) obj;
3572
3573 if (!equals(_scheme, another._scheme)) {
3574 return false;
3575 }
3576
3577 if (!equals(_opaque, another._opaque)) {
3578 return false;
3579 }
3580
3581
3582 if (!equals(_authority, another._authority)) {
3583 return false;
3584 }
3585
3586 if (!equals(_path, another._path)) {
3587 return false;
3588 }
3589
3590 if (!equals(_query, another._query)) {
3591 return false;
3592 }
3593
3594 if (!equals(_fragment, another._fragment)) {
3595 return false;
3596 }
3597 return true;
3598 }
3599
3600
3601
3602 /***
3603 * Write the content of this URI.
3604 *
3605 * @param oos the object-output stream
3606 * @throws IOException If an IO problem occurs.
3607 */
3608 protected void writeObject(ObjectOutputStream oos)
3609 throws IOException {
3610
3611 oos.defaultWriteObject();
3612 }
3613
3614
3615 /***
3616 * Read a URI.
3617 *
3618 * @param ois the object-input stream
3619 * @throws ClassNotFoundException If one of the classes specified in the
3620 * input stream cannot be found.
3621 * @throws IOException If an IO problem occurs.
3622 */
3623 protected void readObject(ObjectInputStream ois)
3624 throws ClassNotFoundException, IOException {
3625
3626 ois.defaultReadObject();
3627 }
3628
3629
3630
3631 /***
3632 * Return a hash code for this URI.
3633 *
3634 * @return a has code value for this URI
3635 */
3636 public int hashCode() {
3637 if (hash == 0) {
3638 char[] c = _uri;
3639 if (c != null) {
3640 for (int i = 0, len = c.length; i < len; i++) {
3641 hash = 31 * hash + c[i];
3642 }
3643 }
3644 c = _fragment;
3645 if (c != null) {
3646 for (int i = 0, len = c.length; i < len; i++) {
3647 hash = 31 * hash + c[i];
3648 }
3649 }
3650 }
3651 return hash;
3652 }
3653
3654
3655
3656 /***
3657 * Compare this URI to another object.
3658 *
3659 * @param obj the object to be compared.
3660 * @return 0, if it's same,
3661 * -1, if failed, first being compared with in the authority component
3662 * @throws ClassCastException not URI argument
3663 */
3664 public int compareTo(Object obj) throws ClassCastException {
3665
3666 URI another = (URI) obj;
3667 if (!equals(_authority, another.getRawAuthority())) {
3668 return -1;
3669 }
3670 return toString().compareTo(another.toString());
3671 }
3672
3673
3674
3675 /***
3676 * Create and return a copy of this object, the URI-reference containing
3677 * the userinfo component. Notice that the whole URI-reference including
3678 * the userinfo component counld not be gotten as a <code>String</code>.
3679 * <p>
3680 * To copy the identical <code>URI</code> object including the userinfo
3681 * component, it should be used.
3682 *
3683 * @return a clone of this instance
3684 */
3685 public synchronized Object clone() {
3686
3687 URI instance = new URI();
3688
3689 instance._uri = _uri;
3690 instance._scheme = _scheme;
3691 instance._opaque = _opaque;
3692 instance._authority = _authority;
3693 instance._userinfo = _userinfo;
3694 instance._host = _host;
3695 instance._port = _port;
3696 instance._path = _path;
3697 instance._query = _query;
3698 instance._fragment = _fragment;
3699
3700 instance.protocolCharset = protocolCharset;
3701
3702 instance._is_hier_part = _is_hier_part;
3703 instance._is_opaque_part = _is_opaque_part;
3704 instance._is_net_path = _is_net_path;
3705 instance._is_abs_path = _is_abs_path;
3706 instance._is_rel_path = _is_rel_path;
3707 instance._is_reg_name = _is_reg_name;
3708 instance._is_server = _is_server;
3709 instance._is_hostname = _is_hostname;
3710 instance._is_IPv4address = _is_IPv4address;
3711 instance._is_IPv6reference = _is_IPv6reference;
3712
3713 return instance;
3714 }
3715
3716
3717
3718 /***
3719 * It can be gotten the URI character sequence. It's raw-escaped.
3720 * For the purpose of the protocol to be transported, it will be useful.
3721 * <p>
3722 * It is clearly unwise to use a URL that contains a password which is
3723 * intended to be secret. In particular, the use of a password within
3724 * the 'userinfo' component of a URL is strongly disrecommended except
3725 * in those rare cases where the 'password' parameter is intended to be
3726 * public.
3727 * <p>
3728 * When you want to get each part of the userinfo, you need to use the
3729 * specific methods in the specific URL. It depends on the specific URL.
3730 *
3731 * @return the URI character sequence
3732 */
3733 public char[] getRawURI() {
3734 return _uri;
3735 }
3736
3737
3738 /***
3739 * It can be gotten the URI character sequence. It's escaped.
3740 * For the purpose of the protocol to be transported, it will be useful.
3741 *
3742 * @return the escaped URI string
3743 */
3744 public String getEscapedURI() {
3745 return (_uri == null) ? null : new String(_uri);
3746 }
3747
3748
3749 /***
3750 * It can be gotten the URI character sequence.
3751 *
3752 * @return the original URI string
3753 * @throws URIException incomplete trailing escape pattern or unsupported
3754 * character encoding
3755 * @see #decode
3756 */
3757 public String getURI() throws URIException {
3758 return (_uri == null) ? null : decode(_uri, getProtocolCharset());
3759 }
3760
3761
3762 /***
3763 * Get the URI reference character sequence.
3764 *
3765 * @return the URI reference character sequence
3766 */
3767 public char[] getRawURIReference() {
3768 if (_fragment == null) {
3769 return _uri;
3770 }
3771 if (_uri == null) {
3772 return _fragment;
3773 }
3774
3775 String uriReference = new String(_uri) + "#" + new String(_fragment);
3776 return uriReference.toCharArray();
3777 }
3778
3779
3780 /***
3781 * Get the escaped URI reference string.
3782 *
3783 * @return the escaped URI reference string
3784 */
3785 public String getEscapedURIReference() {
3786 char[] uriReference = getRawURIReference();
3787 return (uriReference == null) ? null : new String(uriReference);
3788 }
3789
3790
3791 /***
3792 * Get the original URI reference string.
3793 *
3794 * @return the original URI reference string
3795 * @throws URIException If {@link #decode} fails.
3796 */
3797 public String getURIReference() throws URIException {
3798 char[] uriReference = getRawURIReference();
3799 return (uriReference == null) ? null : decode(uriReference,
3800 getProtocolCharset());
3801 }
3802
3803
3804 /***
3805 * Get the escaped URI string.
3806 * <p>
3807 * On the document, the URI-reference form is only used without the userinfo
3808 * component like http://jakarta.apache.org/ by the security reason.
3809 * But the URI-reference form with the userinfo component could be parsed.
3810 * <p>
3811 * In other words, this URI and any its subclasses must not expose the
3812 * URI-reference expression with the userinfo component like
3813 * http://user:password@hostport/restricted_zone.<br>
3814 * It means that the API client programmer should extract each user and
3815 * password to access manually. Probably it will be supported in the each
3816 * subclass, however, not a whole URI-reference expression.
3817 *
3818 * @return the escaped URI string
3819 * @see #clone()
3820 */
3821 public String toString() {
3822 return getEscapedURI();
3823 }
3824
3825
3826
3827
3828 /***
3829 * The charset-changed normal operation to represent to be required to
3830 * alert to user the fact the default charset is changed.
3831 */
3832 public static class DefaultCharsetChanged extends RuntimeException {
3833
3834
3835
3836 /***
3837 * The constructor with a reason string and its code arguments.
3838 *
3839 * @param reasonCode the reason code
3840 * @param reason the reason
3841 */
3842 public DefaultCharsetChanged(int reasonCode, String reason) {
3843 super(reason);
3844 this.reason = reason;
3845 this.reasonCode = reasonCode;
3846 }
3847
3848
3849
3850 /*** No specified reason code. */
3851 public static final int UNKNOWN = 0;
3852
3853 /*** Protocol charset changed. */
3854 public static final int PROTOCOL_CHARSET = 1;
3855
3856 /*** Document charset changed. */
3857 public static final int DOCUMENT_CHARSET = 2;
3858
3859
3860
3861 /*** The reason code. */
3862 private int reasonCode;
3863
3864 /*** The reason message. */
3865 private String reason;
3866
3867
3868
3869 /***
3870 * Get the reason code.
3871 *
3872 * @return the reason code
3873 */
3874 public int getReasonCode() {
3875 return reasonCode;
3876 }
3877
3878 /***
3879 * Get the reason message.
3880 *
3881 * @return the reason message
3882 */
3883 public String getReason() {
3884 return reason;
3885 }
3886
3887 }
3888
3889
3890 /***
3891 * A mapping to determine the (somewhat arbitrarily) preferred charset for a
3892 * given locale. Supports all locales recognized in JDK 1.1.
3893 * <p>
3894 * The distribution of this class is Servlets.com. It was originally
3895 * written by Jason Hunter [jhunter at acm.org] and used by with permission.
3896 */
3897 public static class LocaleToCharsetMap {
3898
3899 /*** A mapping of language code to charset */
3900 private static final Hashtable LOCALE_TO_CHARSET_MAP;
3901 static {
3902 LOCALE_TO_CHARSET_MAP = new Hashtable();
3903 LOCALE_TO_CHARSET_MAP.put("ar", "ISO-8859-6");
3904 LOCALE_TO_CHARSET_MAP.put("be", "ISO-8859-5");
3905 LOCALE_TO_CHARSET_MAP.put("bg", "ISO-8859-5");
3906 LOCALE_TO_CHARSET_MAP.put("ca", "ISO-8859-1");
3907 LOCALE_TO_CHARSET_MAP.put("cs", "ISO-8859-2");
3908 LOCALE_TO_CHARSET_MAP.put("da", "ISO-8859-1");
3909 LOCALE_TO_CHARSET_MAP.put("de", "ISO-8859-1");
3910 LOCALE_TO_CHARSET_MAP.put("el", "ISO-8859-7");
3911 LOCALE_TO_CHARSET_MAP.put("en", "ISO-8859-1");
3912 LOCALE_TO_CHARSET_MAP.put("es", "ISO-8859-1");
3913 LOCALE_TO_CHARSET_MAP.put("et", "ISO-8859-1");
3914 LOCALE_TO_CHARSET_MAP.put("fi", "ISO-8859-1");
3915 LOCALE_TO_CHARSET_MAP.put("fr", "ISO-8859-1");
3916 LOCALE_TO_CHARSET_MAP.put("hr", "ISO-8859-2");
3917 LOCALE_TO_CHARSET_MAP.put("hu", "ISO-8859-2");
3918 LOCALE_TO_CHARSET_MAP.put("is", "ISO-8859-1");
3919 LOCALE_TO_CHARSET_MAP.put("it", "ISO-8859-1");
3920 LOCALE_TO_CHARSET_MAP.put("iw", "ISO-8859-8");
3921 LOCALE_TO_CHARSET_MAP.put("ja", "Shift_JIS");
3922 LOCALE_TO_CHARSET_MAP.put("ko", "EUC-KR");
3923 LOCALE_TO_CHARSET_MAP.put("lt", "ISO-8859-2");
3924 LOCALE_TO_CHARSET_MAP.put("lv", "ISO-8859-2");
3925 LOCALE_TO_CHARSET_MAP.put("mk", "ISO-8859-5");
3926 LOCALE_TO_CHARSET_MAP.put("nl", "ISO-8859-1");
3927 LOCALE_TO_CHARSET_MAP.put("no", "ISO-8859-1");
3928 LOCALE_TO_CHARSET_MAP.put("pl", "ISO-8859-2");
3929 LOCALE_TO_CHARSET_MAP.put("pt", "ISO-8859-1");
3930 LOCALE_TO_CHARSET_MAP.put("ro", "ISO-8859-2");
3931 LOCALE_TO_CHARSET_MAP.put("ru", "ISO-8859-5");
3932 LOCALE_TO_CHARSET_MAP.put("sh", "ISO-8859-5");
3933 LOCALE_TO_CHARSET_MAP.put("sk", "ISO-8859-2");
3934 LOCALE_TO_CHARSET_MAP.put("sl", "ISO-8859-2");
3935 LOCALE_TO_CHARSET_MAP.put("sq", "ISO-8859-2");
3936 LOCALE_TO_CHARSET_MAP.put("sr", "ISO-8859-5");
3937 LOCALE_TO_CHARSET_MAP.put("sv", "ISO-8859-1");
3938 LOCALE_TO_CHARSET_MAP.put("tr", "ISO-8859-9");
3939 LOCALE_TO_CHARSET_MAP.put("uk", "ISO-8859-5");
3940 LOCALE_TO_CHARSET_MAP.put("zh", "GB2312");
3941 LOCALE_TO_CHARSET_MAP.put("zh_TW", "Big5");
3942 }
3943
3944 /***
3945 * Get the preferred charset for the given locale.
3946 *
3947 * @param locale the locale
3948 * @return the preferred charset or null if the locale is not
3949 * recognized.
3950 */
3951 public static String getCharset(Locale locale) {
3952
3953 String charset =
3954 (String) LOCALE_TO_CHARSET_MAP.get(locale.toString());
3955 if (charset != null) {
3956 return charset;
3957 }
3958
3959
3960 charset = (String) LOCALE_TO_CHARSET_MAP.get(locale.getLanguage());
3961 return charset;
3962 }
3963
3964 }
3965
3966 }
3967