View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    * 
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   * 
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.struts2.jasper.xmlparser;
19  
20  import com.opensymphony.xwork2.util.logging.Logger;
21  import com.opensymphony.xwork2.util.logging.LoggerFactory;
22  import org.apache.struts2.jasper.compiler.Localizer;
23  
24  import java.io.IOException;
25  import java.io.InputStream;
26  import java.io.Reader;
27  import java.io.UTFDataFormatException;
28  
29  /***
30   * @author Andy Clark, IBM
31   * @version $Id: UTF8Reader.java 466606 2006-10-21 23:07:12Z markt $
32   */
33  public class UTF8Reader
34          extends Reader {
35  
36      private Logger log = LoggerFactory.getLogger(UTF8Reader.class);
37  
38      //
39      // Constants
40      //
41  
42      /***
43       * Default byte buffer size (2048).
44       */
45      public static final int DEFAULT_BUFFER_SIZE = 2048;
46  
47      // debugging
48  
49      /***
50       * Debug read.
51       */
52      private static final boolean DEBUG_READ = false;
53  
54      //
55      // Data
56      //
57  
58      /***
59       * Input stream.
60       */
61      protected InputStream fInputStream;
62  
63      /***
64       * Byte buffer.
65       */
66      protected byte[] fBuffer;
67  
68      /***
69       * Offset into buffer.
70       */
71      protected int fOffset;
72  
73      /***
74       * Surrogate character.
75       */
76      private int fSurrogate = -1;
77  
78      //
79      // Constructors
80      //
81  
82      /***
83       * Constructs a UTF-8 reader from the specified input stream,
84       * buffer size and MessageFormatter.
85       *
86       * @param inputStream The input stream.
87       * @param size        The initial buffer size.
88       */
89      public UTF8Reader(InputStream inputStream, int size) {
90          fInputStream = inputStream;
91          fBuffer = new byte[size];
92      }
93  
94      //
95      // Reader methods
96      //
97  
98      /***
99       * Read a single character.  This method will block until a character is
100      * available, an I/O error occurs, or the end of the stream is reached.
101      * <p/>
102      * <p> Subclasses that intend to support efficient single-character input
103      * should override this method.
104      *
105      * @return The character read, as an integer in the range 0 to 16383
106      *         (<tt>0x00-0xffff</tt>), or -1 if the end of the stream has
107      *         been reached
108      * @throws IOException If an I/O error occurs
109      */
110     public int read() throws IOException {
111 
112         // decode character
113         int c = fSurrogate;
114         if (fSurrogate == -1) {
115             // NOTE: We use the index into the buffer if there are remaining
116             //       bytes from the last block read. -Ac
117             int index = 0;
118 
119             // get first byte
120             int b0 = index == fOffset
121                     ? fInputStream.read() : fBuffer[index++] & 0x00FF;
122             if (b0 == -1) {
123                 return -1;
124             }
125 
126             // UTF-8:   [0xxx xxxx]
127             // Unicode: [0000 0000] [0xxx xxxx]
128             if (b0 < 0x80) {
129                 c = (char) b0;
130             }
131 
132             // UTF-8:   [110y yyyy] [10xx xxxx]
133             // Unicode: [0000 0yyy] [yyxx xxxx]
134             else if ((b0 & 0xE0) == 0xC0) {
135                 int b1 = index == fOffset
136                         ? fInputStream.read() : fBuffer[index++] & 0x00FF;
137                 if (b1 == -1) {
138                     expectedByte(2, 2);
139                 }
140                 if ((b1 & 0xC0) != 0x80) {
141                     invalidByte(2, 2, b1);
142                 }
143                 c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
144             }
145 
146             // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
147             // Unicode: [zzzz yyyy] [yyxx xxxx]
148             else if ((b0 & 0xF0) == 0xE0) {
149                 int b1 = index == fOffset
150                         ? fInputStream.read() : fBuffer[index++] & 0x00FF;
151                 if (b1 == -1) {
152                     expectedByte(2, 3);
153                 }
154                 if ((b1 & 0xC0) != 0x80) {
155                     invalidByte(2, 3, b1);
156                 }
157                 int b2 = index == fOffset
158                         ? fInputStream.read() : fBuffer[index++] & 0x00FF;
159                 if (b2 == -1) {
160                     expectedByte(3, 3);
161                 }
162                 if ((b2 & 0xC0) != 0x80) {
163                     invalidByte(3, 3, b2);
164                 }
165                 c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
166                         (b2 & 0x003F);
167             }
168 
169             // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
170             // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
171             //          [1101 11yy] [yyxx xxxx] (low surrogate)
172             //          * uuuuu = wwww + 1
173             else if ((b0 & 0xF8) == 0xF0) {
174                 int b1 = index == fOffset
175                         ? fInputStream.read() : fBuffer[index++] & 0x00FF;
176                 if (b1 == -1) {
177                     expectedByte(2, 4);
178                 }
179                 if ((b1 & 0xC0) != 0x80) {
180                     invalidByte(2, 3, b1);
181                 }
182                 int b2 = index == fOffset
183                         ? fInputStream.read() : fBuffer[index++] & 0x00FF;
184                 if (b2 == -1) {
185                     expectedByte(3, 4);
186                 }
187                 if ((b2 & 0xC0) != 0x80) {
188                     invalidByte(3, 3, b2);
189                 }
190                 int b3 = index == fOffset
191                         ? fInputStream.read() : fBuffer[index++] & 0x00FF;
192                 if (b3 == -1) {
193                     expectedByte(4, 4);
194                 }
195                 if ((b3 & 0xC0) != 0x80) {
196                     invalidByte(4, 4, b3);
197                 }
198                 int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
199                 if (uuuuu > 0x10) {
200                     invalidSurrogate(uuuuu);
201                 }
202                 int wwww = uuuuu - 1;
203                 int hs = 0xD800 |
204                         ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) |
205                         ((b2 >> 4) & 0x0003);
206                 int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
207                 c = hs;
208                 fSurrogate = ls;
209             }
210 
211             // error
212             else {
213                 invalidByte(1, 1, b0);
214             }
215         }
216 
217         // use surrogate
218         else {
219             fSurrogate = -1;
220         }
221 
222         // return character
223         if (DEBUG_READ) {
224             if (log.isDebugEnabled())
225                 log.debug("read(): 0x" + Integer.toHexString(c));
226         }
227         return c;
228 
229     } // read():int
230 
231     /***
232      * Read characters into a portion of an array.  This method will block
233      * until some input is available, an I/O error occurs, or the end of the
234      * stream is reached.
235      *
236      * @param ch     Destination buffer
237      * @param offset Offset at which to start storing characters
238      * @param length Maximum number of characters to read
239      * @return The number of characters read, or -1 if the end of the
240      *         stream has been reached
241      * @throws IOException If an I/O error occurs
242      */
243     public int read(char ch[], int offset, int length) throws IOException {
244 
245         // handle surrogate
246         int out = offset;
247         if (fSurrogate != -1) {
248             ch[offset + 1] = (char) fSurrogate;
249             fSurrogate = -1;
250             length--;
251             out++;
252         }
253 
254         // read bytes
255         int count = 0;
256         if (fOffset == 0) {
257             // adjust length to read
258             if (length > fBuffer.length) {
259                 length = fBuffer.length;
260             }
261 
262             // perform read operation
263             count = fInputStream.read(fBuffer, 0, length);
264             if (count == -1) {
265                 return -1;
266             }
267             count += out - offset;
268         }
269 
270         // skip read; last character was in error
271         // NOTE: Having an offset value other than zero means that there was
272         //       an error in the last character read. In this case, we have
273         //       skipped the read so we don't consume any bytes past the 
274         //       error. By signalling the error on the next block read we
275         //       allow the method to return the most valid characters that
276         //       it can on the previous block read. -Ac
277         else {
278             count = fOffset;
279             fOffset = 0;
280         }
281 
282         // convert bytes to characters
283         final int total = count;
284         for (int in = 0; in < total; in++) {
285             int b0 = fBuffer[in] & 0x00FF;
286 
287             // UTF-8:   [0xxx xxxx]
288             // Unicode: [0000 0000] [0xxx xxxx]
289             if (b0 < 0x80) {
290                 ch[out++] = (char) b0;
291                 continue;
292             }
293 
294             // UTF-8:   [110y yyyy] [10xx xxxx]
295             // Unicode: [0000 0yyy] [yyxx xxxx]
296             if ((b0 & 0xE0) == 0xC0) {
297                 int b1 = -1;
298                 if (++in < total) {
299                     b1 = fBuffer[in] & 0x00FF;
300                 } else {
301                     b1 = fInputStream.read();
302                     if (b1 == -1) {
303                         if (out > offset) {
304                             fBuffer[0] = (byte) b0;
305                             fOffset = 1;
306                             return out - offset;
307                         }
308                         expectedByte(2, 2);
309                     }
310                     count++;
311                 }
312                 if ((b1 & 0xC0) != 0x80) {
313                     if (out > offset) {
314                         fBuffer[0] = (byte) b0;
315                         fBuffer[1] = (byte) b1;
316                         fOffset = 2;
317                         return out - offset;
318                     }
319                     invalidByte(2, 2, b1);
320                 }
321                 int c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
322                 ch[out++] = (char) c;
323                 count -= 1;
324                 continue;
325             }
326 
327             // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
328             // Unicode: [zzzz yyyy] [yyxx xxxx]
329             if ((b0 & 0xF0) == 0xE0) {
330                 int b1 = -1;
331                 if (++in < total) {
332                     b1 = fBuffer[in] & 0x00FF;
333                 } else {
334                     b1 = fInputStream.read();
335                     if (b1 == -1) {
336                         if (out > offset) {
337                             fBuffer[0] = (byte) b0;
338                             fOffset = 1;
339                             return out - offset;
340                         }
341                         expectedByte(2, 3);
342                     }
343                     count++;
344                 }
345                 if ((b1 & 0xC0) != 0x80) {
346                     if (out > offset) {
347                         fBuffer[0] = (byte) b0;
348                         fBuffer[1] = (byte) b1;
349                         fOffset = 2;
350                         return out - offset;
351                     }
352                     invalidByte(2, 3, b1);
353                 }
354                 int b2 = -1;
355                 if (++in < total) {
356                     b2 = fBuffer[in] & 0x00FF;
357                 } else {
358                     b2 = fInputStream.read();
359                     if (b2 == -1) {
360                         if (out > offset) {
361                             fBuffer[0] = (byte) b0;
362                             fBuffer[1] = (byte) b1;
363                             fOffset = 2;
364                             return out - offset;
365                         }
366                         expectedByte(3, 3);
367                     }
368                     count++;
369                 }
370                 if ((b2 & 0xC0) != 0x80) {
371                     if (out > offset) {
372                         fBuffer[0] = (byte) b0;
373                         fBuffer[1] = (byte) b1;
374                         fBuffer[2] = (byte) b2;
375                         fOffset = 3;
376                         return out - offset;
377                     }
378                     invalidByte(3, 3, b2);
379                 }
380                 int c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
381                         (b2 & 0x003F);
382                 ch[out++] = (char) c;
383                 count -= 2;
384                 continue;
385             }
386 
387             // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
388             // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
389             //          [1101 11yy] [yyxx xxxx] (low surrogate)
390             //          * uuuuu = wwww + 1
391             if ((b0 & 0xF8) == 0xF0) {
392                 int b1 = -1;
393                 if (++in < total) {
394                     b1 = fBuffer[in] & 0x00FF;
395                 } else {
396                     b1 = fInputStream.read();
397                     if (b1 == -1) {
398                         if (out > offset) {
399                             fBuffer[0] = (byte) b0;
400                             fOffset = 1;
401                             return out - offset;
402                         }
403                         expectedByte(2, 4);
404                     }
405                     count++;
406                 }
407                 if ((b1 & 0xC0) != 0x80) {
408                     if (out > offset) {
409                         fBuffer[0] = (byte) b0;
410                         fBuffer[1] = (byte) b1;
411                         fOffset = 2;
412                         return out - offset;
413                     }
414                     invalidByte(2, 4, b1);
415                 }
416                 int b2 = -1;
417                 if (++in < total) {
418                     b2 = fBuffer[in] & 0x00FF;
419                 } else {
420                     b2 = fInputStream.read();
421                     if (b2 == -1) {
422                         if (out > offset) {
423                             fBuffer[0] = (byte) b0;
424                             fBuffer[1] = (byte) b1;
425                             fOffset = 2;
426                             return out - offset;
427                         }
428                         expectedByte(3, 4);
429                     }
430                     count++;
431                 }
432                 if ((b2 & 0xC0) != 0x80) {
433                     if (out > offset) {
434                         fBuffer[0] = (byte) b0;
435                         fBuffer[1] = (byte) b1;
436                         fBuffer[2] = (byte) b2;
437                         fOffset = 3;
438                         return out - offset;
439                     }
440                     invalidByte(3, 4, b2);
441                 }
442                 int b3 = -1;
443                 if (++in < total) {
444                     b3 = fBuffer[in] & 0x00FF;
445                 } else {
446                     b3 = fInputStream.read();
447                     if (b3 == -1) {
448                         if (out > offset) {
449                             fBuffer[0] = (byte) b0;
450                             fBuffer[1] = (byte) b1;
451                             fBuffer[2] = (byte) b2;
452                             fOffset = 3;
453                             return out - offset;
454                         }
455                         expectedByte(4, 4);
456                     }
457                     count++;
458                 }
459                 if ((b3 & 0xC0) != 0x80) {
460                     if (out > offset) {
461                         fBuffer[0] = (byte) b0;
462                         fBuffer[1] = (byte) b1;
463                         fBuffer[2] = (byte) b2;
464                         fBuffer[3] = (byte) b3;
465                         fOffset = 4;
466                         return out - offset;
467                     }
468                     invalidByte(4, 4, b2);
469                 }
470 
471                 // decode bytes into surrogate characters
472                 int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
473                 if (uuuuu > 0x10) {
474                     invalidSurrogate(uuuuu);
475                 }
476                 int wwww = uuuuu - 1;
477                 int zzzz = b1 & 0x000F;
478                 int yyyyyy = b2 & 0x003F;
479                 int xxxxxx = b3 & 0x003F;
480                 int hs = 0xD800 | ((wwww << 6) & 0x03C0) | (zzzz << 2) | (yyyyyy >> 4);
481                 int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
482 
483                 // set characters
484                 ch[out++] = (char) hs;
485                 ch[out++] = (char) ls;
486                 count -= 2;
487                 continue;
488             }
489 
490             // error
491             if (out > offset) {
492                 fBuffer[0] = (byte) b0;
493                 fOffset = 1;
494                 return out - offset;
495             }
496             invalidByte(1, 1, b0);
497         }
498 
499         // return number of characters converted
500         if (DEBUG_READ) {
501             if (log.isDebugEnabled())
502                 log.debug("read(char[]," + offset + ',' + length + "): count=" + count);
503         }
504         return count;
505 
506     } // read(char[],int,int)
507 
508     /***
509      * Skip characters.  This method will block until some characters are
510      * available, an I/O error occurs, or the end of the stream is reached.
511      *
512      * @param n The number of characters to skip
513      * @return The number of characters actually skipped
514      * @throws IOException If an I/O error occurs
515      */
516     public long skip(long n) throws IOException {
517 
518         long remaining = n;
519         final char[] ch = new char[fBuffer.length];
520         do {
521             int length = ch.length < remaining ? ch.length : (int) remaining;
522             int count = read(ch, 0, length);
523             if (count > 0) {
524                 remaining -= count;
525             } else {
526                 break;
527             }
528         } while (remaining > 0);
529 
530         long skipped = n - remaining;
531         return skipped;
532 
533     } // skip(long):long
534 
535     /***
536      * Tell whether this stream is ready to be read.
537      *
538      * @return True if the next read() is guaranteed not to block for input,
539      *         false otherwise.  Note that returning false does not guarantee that the
540      *         next read will block.
541      * @throws IOException If an I/O error occurs
542      */
543     public boolean ready() throws IOException {
544         return false;
545     } // ready()
546 
547     /***
548      * Tell whether this stream supports the mark() operation.
549      */
550     public boolean markSupported() {
551         return false;
552     } // markSupported()
553 
554     /***
555      * Mark the present position in the stream.  Subsequent calls to reset()
556      * will attempt to reposition the stream to this point.  Not all
557      * character-input streams support the mark() operation.
558      *
559      * @param readAheadLimit Limit on the number of characters that may be
560      *                       read while still preserving the mark.  After
561      *                       reading this many characters, attempting to
562      *                       reset the stream may fail.
563      * @throws IOException If the stream does not support mark(),
564      *                     or if some other I/O error occurs
565      */
566     public void mark(int readAheadLimit) throws IOException {
567         throw new IOException(
568                 Localizer.getMessage("jsp.error.xml.operationNotSupported",
569                         "mark()", "UTF-8"));
570     }
571 
572     /***
573      * Reset the stream.  If the stream has been marked, then attempt to
574      * reposition it at the mark.  If the stream has not been marked, then
575      * attempt to reset it in some way appropriate to the particular stream,
576      * for example by repositioning it to its starting point.  Not all
577      * character-input streams support the reset() operation, and some support
578      * reset() without supporting mark().
579      *
580      * @throws IOException If the stream has not been marked,
581      *                     or if the mark has been invalidated,
582      *                     or if the stream does not support reset(),
583      *                     or if some other I/O error occurs
584      */
585     public void reset() throws IOException {
586         fOffset = 0;
587         fSurrogate = -1;
588     } // reset()
589 
590     /***
591      * Close the stream.  Once a stream has been closed, further read(),
592      * ready(), mark(), or reset() invocations will throw an IOException.
593      * Closing a previously-closed stream, however, has no effect.
594      *
595      * @throws IOException If an I/O error occurs
596      */
597     public void close() throws IOException {
598         fInputStream.close();
599     } // close()
600 
601     //
602     // Private methods
603     //
604 
605     /***
606      * Throws an exception for expected byte.
607      */
608     private void expectedByte(int position, int count)
609             throws UTFDataFormatException {
610 
611         throw new UTFDataFormatException(
612                 Localizer.getMessage("jsp.error.xml.expectedByte",
613                         Integer.toString(position),
614                         Integer.toString(count)));
615 
616     } // expectedByte(int,int,int)
617 
618     /***
619      * Throws an exception for invalid byte.
620      */
621     private void invalidByte(int position, int count, int c)
622             throws UTFDataFormatException {
623 
624         throw new UTFDataFormatException(
625                 Localizer.getMessage("jsp.error.xml.invalidByte",
626                         Integer.toString(position),
627                         Integer.toString(count)));
628     } // invalidByte(int,int,int,int)
629 
630     /***
631      * Throws an exception for invalid surrogate bits.
632      */
633     private void invalidSurrogate(int uuuuu) throws UTFDataFormatException {
634 
635         throw new UTFDataFormatException(
636                 Localizer.getMessage("jsp.error.xml.invalidHighSurrogate",
637                         Integer.toHexString(uuuuu)));
638     } // invalidSurrogate(int)
639 
640 } // class UTF8Reader