1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.betwixt;
/**
 * <p><code>XMLUtils</code> contains basic utility methods for XML:
 * escaping of body text, attribute values and <code>CDATA</code> content,
 * and checking whether a string is a well formed XML 1.0 name.</p>
 *
 * <p>The code for {@link #isWellFormedXMLName} is based on code in
 * <code>org.apache.xerces.util.XMLChar</code>
 * in <a href='http://xerces.apache.org/xerces2-j/index.html'>Apache Xerces</a>.
 * The authors of this class are credited below.</p>
 *
 * @author Glenn Marcy, IBM
 * @author Andy Clark, IBM
 * @author Eric Ye, IBM
 * @author Arnaud Le Hors, IBM
 * @author Rahul Srivastava, Sun Microsystems Inc.
 *
 * @author Robert Burrell Donkin
 * @since 0.5
 */
public class XMLUtils {

    // Constants
    //-------------------------------------------------------------------------

    /** Entity reference that replaces the less-than sign. */
    public static final String LESS_THAN_ENTITY = "&lt;";
    /** Entity reference that replaces the greater-than sign. */
    public static final String GREATER_THAN_ENTITY = "&gt;";
    /** Entity reference that replaces the ampersand. */
    public static final String AMPERSAND_ENTITY = "&amp;";
    /** Entity reference that replaces the apostrophe (single quote). */
    public static final String APOSTROPHE_ENTITY = "&apos;";
    /** Entity reference that replaces the double quote. */
    public static final String QUOTE_ENTITY = "&quot;";

    /** Name start character mask. */
    private static final int MASK_NAME_START = 0x01;
    /** Name character mask. */
    private static final int MASK_NAME = 0x02;

    // Class attributes
    //-------------------------------------------------------------------------

    /**
     * Character flags, indexed by UTF-16 code unit (0x0000 - 0xFFFF).
     * Each entry is a combination of {@link #MASK_NAME_START} and {@link #MASK_NAME}.
     */
    private static final byte[] CHARS = new byte[1 << 16];

    // Class initialization
    //-------------------------------------------------------------------------

    static {

        // Tables of single characters list each character on its own;
        // tables of ranges list inclusive (first, last) pairs.

        // characters allowed inside a name but not at its start
        int[] nameChar = {
            0x002D, 0x002E, // '-' and '.'
        };

        // punctuation allowed at the start of a name
        int[] nameStartChar = {
            0x003A, 0x005F, // ':' and '_'
        };

        int[] letterRange = {
            // BaseChar ranges
            0x0041, 0x005A, 0x0061, 0x007A, 0x00C0, 0x00D6, 0x00D8, 0x00F6,
            0x00F8, 0x0131, 0x0134, 0x013E, 0x0141, 0x0148, 0x014A, 0x017E,
            0x0180, 0x01C3, 0x01CD, 0x01F0, 0x01F4, 0x01F5, 0x01FA, 0x0217,
            0x0250, 0x02A8, 0x02BB, 0x02C1, 0x0388, 0x038A, 0x038E, 0x03A1,
            0x03A3, 0x03CE, 0x03D0, 0x03D6, 0x03E2, 0x03F3, 0x0401, 0x040C,
            0x040E, 0x044F, 0x0451, 0x045C, 0x045E, 0x0481, 0x0490, 0x04C4,
            0x04C7, 0x04C8, 0x04CB, 0x04CC, 0x04D0, 0x04EB, 0x04EE, 0x04F5,
            0x04F8, 0x04F9, 0x0531, 0x0556, 0x0561, 0x0586, 0x05D0, 0x05EA,
            0x05F0, 0x05F2, 0x0621, 0x063A, 0x0641, 0x064A, 0x0671, 0x06B7,
            0x06BA, 0x06BE, 0x06C0, 0x06CE, 0x06D0, 0x06D3, 0x06E5, 0x06E6,
            0x0905, 0x0939, 0x0958, 0x0961, 0x0985, 0x098C, 0x098F, 0x0990,
            0x0993, 0x09A8, 0x09AA, 0x09B0, 0x09B6, 0x09B9, 0x09DC, 0x09DD,
            0x09DF, 0x09E1, 0x09F0, 0x09F1, 0x0A05, 0x0A0A, 0x0A0F, 0x0A10,
            0x0A13, 0x0A28, 0x0A2A, 0x0A30, 0x0A32, 0x0A33, 0x0A35, 0x0A36,
            0x0A38, 0x0A39, 0x0A59, 0x0A5C, 0x0A72, 0x0A74, 0x0A85, 0x0A8B,
            0x0A8F, 0x0A91, 0x0A93, 0x0AA8, 0x0AAA, 0x0AB0, 0x0AB2, 0x0AB3,
            0x0AB5, 0x0AB9, 0x0B05, 0x0B0C, 0x0B0F, 0x0B10, 0x0B13, 0x0B28,
            0x0B2A, 0x0B30, 0x0B32, 0x0B33, 0x0B36, 0x0B39, 0x0B5C, 0x0B5D,
            0x0B5F, 0x0B61, 0x0B85, 0x0B8A, 0x0B8E, 0x0B90, 0x0B92, 0x0B95,
            0x0B99, 0x0B9A, 0x0B9E, 0x0B9F, 0x0BA3, 0x0BA4, 0x0BA8, 0x0BAA,
            0x0BAE, 0x0BB5, 0x0BB7, 0x0BB9, 0x0C05, 0x0C0C, 0x0C0E, 0x0C10,
            0x0C12, 0x0C28, 0x0C2A, 0x0C33, 0x0C35, 0x0C39, 0x0C60, 0x0C61,
            0x0C85, 0x0C8C, 0x0C8E, 0x0C90, 0x0C92, 0x0CA8, 0x0CAA, 0x0CB3,
            0x0CB5, 0x0CB9, 0x0CE0, 0x0CE1, 0x0D05, 0x0D0C, 0x0D0E, 0x0D10,
            0x0D12, 0x0D28, 0x0D2A, 0x0D39, 0x0D60, 0x0D61, 0x0E01, 0x0E2E,
            0x0E32, 0x0E33, 0x0E40, 0x0E45, 0x0E81, 0x0E82, 0x0E87, 0x0E88,
            0x0E94, 0x0E97, 0x0E99, 0x0E9F, 0x0EA1, 0x0EA3, 0x0EAA, 0x0EAB,
            0x0EAD, 0x0EAE, 0x0EB2, 0x0EB3, 0x0EC0, 0x0EC4, 0x0F40, 0x0F47,
            0x0F49, 0x0F69, 0x10A0, 0x10C5, 0x10D0, 0x10F6, 0x1102, 0x1103,
            0x1105, 0x1107, 0x110B, 0x110C, 0x110E, 0x1112, 0x1154, 0x1155,
            0x115F, 0x1161, 0x116D, 0x116E, 0x1172, 0x1173, 0x11AE, 0x11AF,
            0x11B7, 0x11B8, 0x11BC, 0x11C2, 0x1E00, 0x1E9B, 0x1EA0, 0x1EF9,
            0x1F00, 0x1F15, 0x1F18, 0x1F1D, 0x1F20, 0x1F45, 0x1F48, 0x1F4D,
            0x1F50, 0x1F57, 0x1F5F, 0x1F7D, 0x1F80, 0x1FB4, 0x1FB6, 0x1FBC,
            0x1FC2, 0x1FC4, 0x1FC6, 0x1FCC, 0x1FD0, 0x1FD3, 0x1FD6, 0x1FDB,
            0x1FE0, 0x1FEC, 0x1FF2, 0x1FF4, 0x1FF6, 0x1FFC, 0x212A, 0x212B,
            0x2180, 0x2182, 0x3041, 0x3094, 0x30A1, 0x30FA, 0x3105, 0x312C,
            0xAC00, 0xD7A3,
            // Ideographic ranges
            0x3021, 0x3029, 0x4E00, 0x9FA5,
        };
        int[] letterChar = {
            // BaseChar single characters
            0x0386, 0x038C, 0x03DA, 0x03DC, 0x03DE, 0x03E0, 0x0559, 0x06D5,
            0x093D, 0x09B2, 0x0A5E, 0x0A8D, 0x0ABD, 0x0AE0, 0x0B3D, 0x0B9C,
            0x0CDE, 0x0E30, 0x0E84, 0x0E8A, 0x0E8D, 0x0EA5, 0x0EA7, 0x0EB0,
            0x0EBD, 0x1100, 0x1109, 0x113C, 0x113E, 0x1140, 0x114C, 0x114E,
            0x1150, 0x1159, 0x1163, 0x1165, 0x1167, 0x1169, 0x1175, 0x119E,
            0x11A8, 0x11AB, 0x11BA, 0x11EB, 0x11F0, 0x11F9, 0x1F59, 0x1F5B,
            0x1F5D, 0x1FBE, 0x2126, 0x212E,
            // Ideographic single character
            0x3007,
        };

        int[] combiningCharRange = {
            0x0300, 0x0345, 0x0360, 0x0361, 0x0483, 0x0486, 0x0591, 0x05A1,
            0x05A3, 0x05B9, 0x05BB, 0x05BD, 0x05C1, 0x05C2, 0x064B, 0x0652,
            0x06D6, 0x06DC, 0x06DD, 0x06DF, 0x06E0, 0x06E4, 0x06E7, 0x06E8,
            0x06EA, 0x06ED, 0x0901, 0x0903, 0x093E, 0x094C, 0x0951, 0x0954,
            0x0962, 0x0963, 0x0981, 0x0983, 0x09C0, 0x09C4, 0x09C7, 0x09C8,
            0x09CB, 0x09CD, 0x09E2, 0x09E3, 0x0A40, 0x0A42, 0x0A47, 0x0A48,
            0x0A4B, 0x0A4D, 0x0A70, 0x0A71, 0x0A81, 0x0A83, 0x0ABE, 0x0AC5,
            0x0AC7, 0x0AC9, 0x0ACB, 0x0ACD, 0x0B01, 0x0B03, 0x0B3E, 0x0B43,
            0x0B47, 0x0B48, 0x0B4B, 0x0B4D, 0x0B56, 0x0B57, 0x0B82, 0x0B83,
            0x0BBE, 0x0BC2, 0x0BC6, 0x0BC8, 0x0BCA, 0x0BCD, 0x0C01, 0x0C03,
            0x0C3E, 0x0C44, 0x0C46, 0x0C48, 0x0C4A, 0x0C4D, 0x0C55, 0x0C56,
            0x0C82, 0x0C83, 0x0CBE, 0x0CC4, 0x0CC6, 0x0CC8, 0x0CCA, 0x0CCD,
            0x0CD5, 0x0CD6, 0x0D02, 0x0D03, 0x0D3E, 0x0D43, 0x0D46, 0x0D48,
            0x0D4A, 0x0D4D, 0x0E34, 0x0E3A, 0x0E47, 0x0E4E, 0x0EB4, 0x0EB9,
            0x0EBB, 0x0EBC, 0x0EC8, 0x0ECD, 0x0F18, 0x0F19, 0x0F71, 0x0F84,
            0x0F86, 0x0F8B, 0x0F90, 0x0F95, 0x0F99, 0x0FAD, 0x0FB1, 0x0FB7,
            0x20D0, 0x20DC, 0x302A, 0x302F,
        };

        int[] combiningCharChar = {
            0x05BF, 0x05C4, 0x0670, 0x093C, 0x094D, 0x09BC, 0x09BE, 0x09BF,
            0x09D7, 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, 0x0ABC, 0x0B3C, 0x0BD7,
            0x0D57, 0x0E31, 0x0EB1, 0x0F35, 0x0F37, 0x0F39, 0x0F3E, 0x0F3F,
            0x0F97, 0x0FB9, 0x20E1, 0x3099, 0x309A,
        };

        int[] digitRange = {
            0x0030, 0x0039, 0x0660, 0x0669, 0x06F0, 0x06F9, 0x0966, 0x096F,
            0x09E6, 0x09EF, 0x0A66, 0x0A6F, 0x0AE6, 0x0AEF, 0x0B66, 0x0B6F,
            0x0BE7, 0x0BEF, 0x0C66, 0x0C6F, 0x0CE6, 0x0CEF, 0x0D66, 0x0D6F,
            0x0E50, 0x0E59, 0x0ED0, 0x0ED9, 0x0F20, 0x0F29,
        };

        int[] extenderRange = {
            0x3031, 0x3035, 0x309D, 0x309E, 0x30FC, 0x30FE,
        };

        int[] extenderChar = {
            0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005,
        };

        // characters that may start a name (and may therefore also continue one)
        final int startAndName = MASK_NAME_START | MASK_NAME;
        flagChars(nameStartChar, startAndName);
        flagRanges(letterRange, startAndName);
        flagChars(letterChar, startAndName);

        // characters that may only appear after the first character of a name
        flagChars(nameChar, MASK_NAME);
        flagRanges(digitRange, MASK_NAME);
        flagRanges(combiningCharRange, MASK_NAME);
        flagChars(combiningCharChar, MASK_NAME);
        flagRanges(extenderRange, MASK_NAME);
        flagChars(extenderChar, MASK_NAME);
    }

    /**
     * Sets the given flag bits in {@link #CHARS} for every listed character.
     *
     * @param codePoints the characters to flag
     * @param flags the mask bits to set
     */
    private static void flagChars(int[] codePoints, int flags) {
        for (int i = 0; i < codePoints.length; i++) {
            CHARS[codePoints[i]] |= flags;
        }
    }

    /**
     * Sets the given flag bits in {@link #CHARS} for every character inside
     * the listed ranges.
     *
     * @param bounds consecutive (first, last) pairs, both ends inclusive
     * @param flags the mask bits to set
     */
    private static void flagRanges(int[] bounds, int flags) {
        for (int i = 0; i < bounds.length; i += 2) {
            for (int c = bounds[i]; c <= bounds[i + 1]; c++) {
                CHARS[c] |= flags;
            }
        }
    }

    // Constructor
    //-------------------------------------------------------------------------

    /**
     * <p>Constructor for use by tools that required <code>JavaBean</code> instances.</p>
     *
     * <p>This constructor is public <strong>only</strong>
     * to permit tools that require a JavaBean instance to operate.
     * <code>XMLUtils</code> instances should <strong>not</strong> be constructed in standard
     * programming. Instead, the class methods should be called directly.</p>
     */
    public XMLUtils() {}

    // Class methods
    //-------------------------------------------------------------------------

    /**
     * <p>Escape the <code>toString</code> of the given object.
     * For use as body text.</p>
     *
     * <p>The less-than sign, the greater-than sign and the ampersand are
     * replaced by {@link #LESS_THAN_ENTITY}, {@link #GREATER_THAN_ENTITY} and
     * {@link #AMPERSAND_ENTITY}; every other character is copied unchanged.</p>
     *
     * @param value escape <code>value.toString()</code>
     * @return text with escaped delimiters
     * @throws NullPointerException if <code>value</code> or its
     *         <code>toString()</code> is null
     */
    public static final String escapeBodyValue(Object value) {
        return escape(value.toString(), false);
    }

    /**
     * <p>Escape the <code>toString</code> of the given object.
     * For use in an attribute value.</p>
     *
     * <p>In addition to the characters escaped by {@link #escapeBodyValue},
     * the apostrophe and the double quote are replaced by
     * {@link #APOSTROPHE_ENTITY} and {@link #QUOTE_ENTITY}.</p>
     *
     * @param value escape <code>value.toString()</code>
     * @return text with characters restricted (for use in attributes) escaped
     * @throws NullPointerException if <code>value</code> or its
     *         <code>toString()</code> is null
     */
    public static final String escapeAttributeValue(Object value) {
        return escape(value.toString(), true);
    }

    /**
     * Copies <code>text</code>, replacing markup delimiters by their entity
     * references in a single pass.
     *
     * @param text the raw text, not null
     * @param escapeQuotes true when apostrophes and double quotes must be
     *        escaped as well (attribute values)
     * @return the escaped text, not null
     */
    private static String escape(String text, boolean escapeQuotes) {
        final int length = text.length();
        // small head-room so that a handful of replacements does not force a resize
        StringBuffer escaped = new StringBuffer(length + 16);
        for (int i = 0; i < length; i++) {
            char ch = text.charAt(i);
            switch (ch) {
                case '<':
                    escaped.append(LESS_THAN_ENTITY);
                    break;
                case '>':
                    escaped.append(GREATER_THAN_ENTITY);
                    break;
                case '&':
                    escaped.append(AMPERSAND_ENTITY);
                    break;
                case '\'':
                    if (escapeQuotes) {
                        escaped.append(APOSTROPHE_ENTITY);
                    } else {
                        escaped.append(ch);
                    }
                    break;
                case '\"':
                    if (escapeQuotes) {
                        escaped.append(QUOTE_ENTITY);
                    } else {
                        escaped.append(ch);
                    }
                    break;
                default:
                    escaped.append(ch);
                    break;
            }
        }
        return escaped.toString();
    }

    /**
     * Escapes the given content suitable for insertion within a
     * <code>CDATA</code> sequence.
     * Within a <code>CDATA</code> section, only the <code>CDEnd</code>
     * string ']]&gt;' is recognized as markup.
     *
     * @param content the body content whose character data should
     * be escaped in a way appropriate for use within a <code>CDATA</code>
     * section of xml.
     * @return escaped character data, not null
     * @throws NullPointerException if <code>content</code> is null
     * @see #escapeCDATAContent(StringBuffer)
     */
    public static final String escapeCDATAContent(String content) {
        StringBuffer buffer = new StringBuffer(content);
        escapeCDATAContent(buffer);
        return buffer.toString();
    }

    /**
     * Escapes, in place, the given content suitable for insertion within a
     * <code>CDATA</code> sequence.
     * Within a <code>CDATA</code> section, only the <code>CDEnd</code>
     * string ']]&gt;' is recognized as markup, so the closing '&gt;' of every
     * such sequence is replaced by {@link #GREATER_THAN_ENTITY}.
     *
     * <p>NOTE(review): XML parsers do not expand entity references inside a
     * <code>CDATA</code> section, so a reader will see the replacement text
     * literally. The behaviour is kept for backward compatibility.</p>
     *
     * @param bufferedContent the body content within a buffer
     * whose character data should
     * be escaped in a way appropriate for use within a <code>CDATA</code>
     * section of xml
     */
    public static final void escapeCDATAContent(StringBuffer bufferedContent) {
        // replacing one character by the entity grows the buffer by this much
        final int growth = GREATER_THAN_ENTITY.length() - 1;
        // a CDEnd sequence needs two preceding characters, hence the start at 2
        for (int i = 2, size = bufferedContent.length(); i < size; i++) {
            if (bufferedContent.charAt(i) == '>'
                    && bufferedContent.charAt(i - 1) == ']'
                    && bufferedContent.charAt(i - 2) == ']') {

                bufferedContent.replace(i, i + 1, GREATER_THAN_ENTITY);
                size += growth;
                // skip over the entity that has just been inserted
                i += growth;
            }
        }
    }

    /**
     * <p>Is this string a well formed xml name?</p>
     *
     * <p>Only certain characters are allowed in well formed element and attribute
     * names in xml. For example, white space is not allowed in a name.</p>
     *
     * <p>The code for this method is based on code in
     * <code>org.apache.xerces.util.XMLChar</code>
     * in <a href='http://xerces.apache.org/xerces2-j/index.html'>Apache Xerces</a>.
     * The authors of this class are credited at the top of this class.</p>
     *
     * @param name the <code>String</code> to be checked for use as an xml attribute
     * or element name. Returns false if <code>name</code> is null
     * @return true if this string would be a well-formed name
     */
    public static boolean isWellFormedXMLName(String name) {
        if (name == null || name.length() == 0) {
            return false;
        }

        // the first character is held to the stricter "name start" rule
        if (!isNameStartChar(name.charAt(0))) {
            return false;
        }

        for (int i = 1; i < name.length(); i++) {
            if (!isNameChar(name.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns true if the specified character is a valid name
     * character as defined by the XML 1.0 specification.
     * Values outside the range 0x0000 - 0xFFFF are never name characters.
     *
     * @param c The character to check.
     * @return true if this is an XML name character
     */
    public static boolean isNameChar(int c) {
        return c >= 0 && c < 0x10000 && (CHARS[c] & MASK_NAME) != 0;
    }

    /**
     * Returns true if the specified character is a valid name start
     * character as defined in the XML 1.0 specification.
     * Values outside the range 0x0000 - 0xFFFF are never name start characters.
     *
     * @param c The character to check.
     * @return true if this is an XML name start character
     */
    public static boolean isNameStartChar(int c) {
        return c >= 0 && c < 0x10000 && (CHARS[c] & MASK_NAME_START) != 0;
    }
}