1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.struts2.jasper.xmlparser;
19
20 import com.opensymphony.xwork2.util.logging.Logger;
21 import com.opensymphony.xwork2.util.logging.LoggerFactory;
22 import org.apache.struts2.jasper.compiler.Localizer;
23
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.Reader;
27 import java.io.UTFDataFormatException;
28
29 /***
30 * @author Andy Clark, IBM
31 * @version $Id: UTF8Reader.java 466606 2006-10-21 23:07:12Z markt $
32 */
33 public class UTF8Reader
34 extends Reader {
35
36 private Logger log = LoggerFactory.getLogger(UTF8Reader.class);
37
38
39
40
41
42 /***
43 * Default byte buffer size (2048).
44 */
45 public static final int DEFAULT_BUFFER_SIZE = 2048;
46
47
48
49 /***
50 * Debug read.
51 */
52 private static final boolean DEBUG_READ = false;
53
54
55
56
57
58 /***
59 * Input stream.
60 */
61 protected InputStream fInputStream;
62
63 /***
64 * Byte buffer.
65 */
66 protected byte[] fBuffer;
67
68 /***
69 * Offset into buffer.
70 */
71 protected int fOffset;
72
73 /***
74 * Surrogate character.
75 */
76 private int fSurrogate = -1;
77
78
79
80
81
82 /***
83 * Constructs a UTF-8 reader from the specified input stream,
84 * buffer size and MessageFormatter.
85 *
86 * @param inputStream The input stream.
87 * @param size The initial buffer size.
88 */
89 public UTF8Reader(InputStream inputStream, int size) {
90 fInputStream = inputStream;
91 fBuffer = new byte[size];
92 }
93
94
95
96
97
98 /***
99 * Read a single character. This method will block until a character is
100 * available, an I/O error occurs, or the end of the stream is reached.
101 * <p/>
102 * <p> Subclasses that intend to support efficient single-character input
103 * should override this method.
104 *
105 * @return The character read, as an integer in the range 0 to 16383
106 * (<tt>0x00-0xffff</tt>), or -1 if the end of the stream has
107 * been reached
108 * @throws IOException If an I/O error occurs
109 */
110 public int read() throws IOException {
111
112
113 int c = fSurrogate;
114 if (fSurrogate == -1) {
115
116
117 int index = 0;
118
119
120 int b0 = index == fOffset
121 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
122 if (b0 == -1) {
123 return -1;
124 }
125
126
127
128 if (b0 < 0x80) {
129 c = (char) b0;
130 }
131
132
133
134 else if ((b0 & 0xE0) == 0xC0) {
135 int b1 = index == fOffset
136 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
137 if (b1 == -1) {
138 expectedByte(2, 2);
139 }
140 if ((b1 & 0xC0) != 0x80) {
141 invalidByte(2, 2, b1);
142 }
143 c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
144 }
145
146
147
148 else if ((b0 & 0xF0) == 0xE0) {
149 int b1 = index == fOffset
150 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
151 if (b1 == -1) {
152 expectedByte(2, 3);
153 }
154 if ((b1 & 0xC0) != 0x80) {
155 invalidByte(2, 3, b1);
156 }
157 int b2 = index == fOffset
158 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
159 if (b2 == -1) {
160 expectedByte(3, 3);
161 }
162 if ((b2 & 0xC0) != 0x80) {
163 invalidByte(3, 3, b2);
164 }
165 c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
166 (b2 & 0x003F);
167 }
168
169
170
171
172
173 else if ((b0 & 0xF8) == 0xF0) {
174 int b1 = index == fOffset
175 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
176 if (b1 == -1) {
177 expectedByte(2, 4);
178 }
179 if ((b1 & 0xC0) != 0x80) {
180 invalidByte(2, 3, b1);
181 }
182 int b2 = index == fOffset
183 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
184 if (b2 == -1) {
185 expectedByte(3, 4);
186 }
187 if ((b2 & 0xC0) != 0x80) {
188 invalidByte(3, 3, b2);
189 }
190 int b3 = index == fOffset
191 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
192 if (b3 == -1) {
193 expectedByte(4, 4);
194 }
195 if ((b3 & 0xC0) != 0x80) {
196 invalidByte(4, 4, b3);
197 }
198 int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
199 if (uuuuu > 0x10) {
200 invalidSurrogate(uuuuu);
201 }
202 int wwww = uuuuu - 1;
203 int hs = 0xD800 |
204 ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) |
205 ((b2 >> 4) & 0x0003);
206 int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
207 c = hs;
208 fSurrogate = ls;
209 }
210
211
212 else {
213 invalidByte(1, 1, b0);
214 }
215 }
216
217
218 else {
219 fSurrogate = -1;
220 }
221
222
223 if (DEBUG_READ) {
224 if (log.isDebugEnabled())
225 log.debug("read(): 0x" + Integer.toHexString(c));
226 }
227 return c;
228
229 }
230
231 /***
232 * Read characters into a portion of an array. This method will block
233 * until some input is available, an I/O error occurs, or the end of the
234 * stream is reached.
235 *
236 * @param ch Destination buffer
237 * @param offset Offset at which to start storing characters
238 * @param length Maximum number of characters to read
239 * @return The number of characters read, or -1 if the end of the
240 * stream has been reached
241 * @throws IOException If an I/O error occurs
242 */
243 public int read(char ch[], int offset, int length) throws IOException {
244
245
246 int out = offset;
247 if (fSurrogate != -1) {
248 ch[offset + 1] = (char) fSurrogate;
249 fSurrogate = -1;
250 length--;
251 out++;
252 }
253
254
255 int count = 0;
256 if (fOffset == 0) {
257
258 if (length > fBuffer.length) {
259 length = fBuffer.length;
260 }
261
262
263 count = fInputStream.read(fBuffer, 0, length);
264 if (count == -1) {
265 return -1;
266 }
267 count += out - offset;
268 }
269
270
271
272
273
274
275
276
277 else {
278 count = fOffset;
279 fOffset = 0;
280 }
281
282
283 final int total = count;
284 for (int in = 0; in < total; in++) {
285 int b0 = fBuffer[in] & 0x00FF;
286
287
288
289 if (b0 < 0x80) {
290 ch[out++] = (char) b0;
291 continue;
292 }
293
294
295
296 if ((b0 & 0xE0) == 0xC0) {
297 int b1 = -1;
298 if (++in < total) {
299 b1 = fBuffer[in] & 0x00FF;
300 } else {
301 b1 = fInputStream.read();
302 if (b1 == -1) {
303 if (out > offset) {
304 fBuffer[0] = (byte) b0;
305 fOffset = 1;
306 return out - offset;
307 }
308 expectedByte(2, 2);
309 }
310 count++;
311 }
312 if ((b1 & 0xC0) != 0x80) {
313 if (out > offset) {
314 fBuffer[0] = (byte) b0;
315 fBuffer[1] = (byte) b1;
316 fOffset = 2;
317 return out - offset;
318 }
319 invalidByte(2, 2, b1);
320 }
321 int c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
322 ch[out++] = (char) c;
323 count -= 1;
324 continue;
325 }
326
327
328
329 if ((b0 & 0xF0) == 0xE0) {
330 int b1 = -1;
331 if (++in < total) {
332 b1 = fBuffer[in] & 0x00FF;
333 } else {
334 b1 = fInputStream.read();
335 if (b1 == -1) {
336 if (out > offset) {
337 fBuffer[0] = (byte) b0;
338 fOffset = 1;
339 return out - offset;
340 }
341 expectedByte(2, 3);
342 }
343 count++;
344 }
345 if ((b1 & 0xC0) != 0x80) {
346 if (out > offset) {
347 fBuffer[0] = (byte) b0;
348 fBuffer[1] = (byte) b1;
349 fOffset = 2;
350 return out - offset;
351 }
352 invalidByte(2, 3, b1);
353 }
354 int b2 = -1;
355 if (++in < total) {
356 b2 = fBuffer[in] & 0x00FF;
357 } else {
358 b2 = fInputStream.read();
359 if (b2 == -1) {
360 if (out > offset) {
361 fBuffer[0] = (byte) b0;
362 fBuffer[1] = (byte) b1;
363 fOffset = 2;
364 return out - offset;
365 }
366 expectedByte(3, 3);
367 }
368 count++;
369 }
370 if ((b2 & 0xC0) != 0x80) {
371 if (out > offset) {
372 fBuffer[0] = (byte) b0;
373 fBuffer[1] = (byte) b1;
374 fBuffer[2] = (byte) b2;
375 fOffset = 3;
376 return out - offset;
377 }
378 invalidByte(3, 3, b2);
379 }
380 int c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
381 (b2 & 0x003F);
382 ch[out++] = (char) c;
383 count -= 2;
384 continue;
385 }
386
387
388
389
390
391 if ((b0 & 0xF8) == 0xF0) {
392 int b1 = -1;
393 if (++in < total) {
394 b1 = fBuffer[in] & 0x00FF;
395 } else {
396 b1 = fInputStream.read();
397 if (b1 == -1) {
398 if (out > offset) {
399 fBuffer[0] = (byte) b0;
400 fOffset = 1;
401 return out - offset;
402 }
403 expectedByte(2, 4);
404 }
405 count++;
406 }
407 if ((b1 & 0xC0) != 0x80) {
408 if (out > offset) {
409 fBuffer[0] = (byte) b0;
410 fBuffer[1] = (byte) b1;
411 fOffset = 2;
412 return out - offset;
413 }
414 invalidByte(2, 4, b1);
415 }
416 int b2 = -1;
417 if (++in < total) {
418 b2 = fBuffer[in] & 0x00FF;
419 } else {
420 b2 = fInputStream.read();
421 if (b2 == -1) {
422 if (out > offset) {
423 fBuffer[0] = (byte) b0;
424 fBuffer[1] = (byte) b1;
425 fOffset = 2;
426 return out - offset;
427 }
428 expectedByte(3, 4);
429 }
430 count++;
431 }
432 if ((b2 & 0xC0) != 0x80) {
433 if (out > offset) {
434 fBuffer[0] = (byte) b0;
435 fBuffer[1] = (byte) b1;
436 fBuffer[2] = (byte) b2;
437 fOffset = 3;
438 return out - offset;
439 }
440 invalidByte(3, 4, b2);
441 }
442 int b3 = -1;
443 if (++in < total) {
444 b3 = fBuffer[in] & 0x00FF;
445 } else {
446 b3 = fInputStream.read();
447 if (b3 == -1) {
448 if (out > offset) {
449 fBuffer[0] = (byte) b0;
450 fBuffer[1] = (byte) b1;
451 fBuffer[2] = (byte) b2;
452 fOffset = 3;
453 return out - offset;
454 }
455 expectedByte(4, 4);
456 }
457 count++;
458 }
459 if ((b3 & 0xC0) != 0x80) {
460 if (out > offset) {
461 fBuffer[0] = (byte) b0;
462 fBuffer[1] = (byte) b1;
463 fBuffer[2] = (byte) b2;
464 fBuffer[3] = (byte) b3;
465 fOffset = 4;
466 return out - offset;
467 }
468 invalidByte(4, 4, b2);
469 }
470
471
472 int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
473 if (uuuuu > 0x10) {
474 invalidSurrogate(uuuuu);
475 }
476 int wwww = uuuuu - 1;
477 int zzzz = b1 & 0x000F;
478 int yyyyyy = b2 & 0x003F;
479 int xxxxxx = b3 & 0x003F;
480 int hs = 0xD800 | ((wwww << 6) & 0x03C0) | (zzzz << 2) | (yyyyyy >> 4);
481 int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
482
483
484 ch[out++] = (char) hs;
485 ch[out++] = (char) ls;
486 count -= 2;
487 continue;
488 }
489
490
491 if (out > offset) {
492 fBuffer[0] = (byte) b0;
493 fOffset = 1;
494 return out - offset;
495 }
496 invalidByte(1, 1, b0);
497 }
498
499
500 if (DEBUG_READ) {
501 if (log.isDebugEnabled())
502 log.debug("read(char[]," + offset + ',' + length + "): count=" + count);
503 }
504 return count;
505
506 }
507
508 /***
509 * Skip characters. This method will block until some characters are
510 * available, an I/O error occurs, or the end of the stream is reached.
511 *
512 * @param n The number of characters to skip
513 * @return The number of characters actually skipped
514 * @throws IOException If an I/O error occurs
515 */
516 public long skip(long n) throws IOException {
517
518 long remaining = n;
519 final char[] ch = new char[fBuffer.length];
520 do {
521 int length = ch.length < remaining ? ch.length : (int) remaining;
522 int count = read(ch, 0, length);
523 if (count > 0) {
524 remaining -= count;
525 } else {
526 break;
527 }
528 } while (remaining > 0);
529
530 long skipped = n - remaining;
531 return skipped;
532
533 }
534
535 /***
536 * Tell whether this stream is ready to be read.
537 *
538 * @return True if the next read() is guaranteed not to block for input,
539 * false otherwise. Note that returning false does not guarantee that the
540 * next read will block.
541 * @throws IOException If an I/O error occurs
542 */
543 public boolean ready() throws IOException {
544 return false;
545 }
546
547 /***
548 * Tell whether this stream supports the mark() operation.
549 */
550 public boolean markSupported() {
551 return false;
552 }
553
554 /***
555 * Mark the present position in the stream. Subsequent calls to reset()
556 * will attempt to reposition the stream to this point. Not all
557 * character-input streams support the mark() operation.
558 *
559 * @param readAheadLimit Limit on the number of characters that may be
560 * read while still preserving the mark. After
561 * reading this many characters, attempting to
562 * reset the stream may fail.
563 * @throws IOException If the stream does not support mark(),
564 * or if some other I/O error occurs
565 */
566 public void mark(int readAheadLimit) throws IOException {
567 throw new IOException(
568 Localizer.getMessage("jsp.error.xml.operationNotSupported",
569 "mark()", "UTF-8"));
570 }
571
572 /***
573 * Reset the stream. If the stream has been marked, then attempt to
574 * reposition it at the mark. If the stream has not been marked, then
575 * attempt to reset it in some way appropriate to the particular stream,
576 * for example by repositioning it to its starting point. Not all
577 * character-input streams support the reset() operation, and some support
578 * reset() without supporting mark().
579 *
580 * @throws IOException If the stream has not been marked,
581 * or if the mark has been invalidated,
582 * or if the stream does not support reset(),
583 * or if some other I/O error occurs
584 */
585 public void reset() throws IOException {
586 fOffset = 0;
587 fSurrogate = -1;
588 }
589
590 /***
591 * Close the stream. Once a stream has been closed, further read(),
592 * ready(), mark(), or reset() invocations will throw an IOException.
593 * Closing a previously-closed stream, however, has no effect.
594 *
595 * @throws IOException If an I/O error occurs
596 */
597 public void close() throws IOException {
598 fInputStream.close();
599 }
600
601
602
603
604
605 /***
606 * Throws an exception for expected byte.
607 */
608 private void expectedByte(int position, int count)
609 throws UTFDataFormatException {
610
611 throw new UTFDataFormatException(
612 Localizer.getMessage("jsp.error.xml.expectedByte",
613 Integer.toString(position),
614 Integer.toString(count)));
615
616 }
617
618 /***
619 * Throws an exception for invalid byte.
620 */
621 private void invalidByte(int position, int count, int c)
622 throws UTFDataFormatException {
623
624 throw new UTFDataFormatException(
625 Localizer.getMessage("jsp.error.xml.invalidByte",
626 Integer.toString(position),
627 Integer.toString(count)));
628 }
629
630 /***
631 * Throws an exception for invalid surrogate bits.
632 */
633 private void invalidSurrogate(int uuuuu) throws UTFDataFormatException {
634
635 throw new UTFDataFormatException(
636 Localizer.getMessage("jsp.error.xml.invalidHighSurrogate",
637 Integer.toHexString(uuuuu)));
638 }
639
640 }