001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.io.input; 018 019import java.io.BufferedInputStream; 020import java.io.BufferedReader; 021import java.io.File; 022import java.io.IOException; 023import java.io.InputStream; 024import java.io.InputStreamReader; 025import java.io.Reader; 026import java.io.StringReader; 027import java.net.HttpURLConnection; 028import java.net.URL; 029import java.net.URLConnection; 030import java.nio.charset.Charset; 031import java.nio.charset.StandardCharsets; 032import java.nio.file.Files; 033import java.nio.file.Path; 034import java.text.MessageFormat; 035import java.util.Locale; 036import java.util.Objects; 037import java.util.regex.Matcher; 038import java.util.regex.Pattern; 039 040import org.apache.commons.io.ByteOrderMark; 041import org.apache.commons.io.Charsets; 042import org.apache.commons.io.IOUtils; 043import org.apache.commons.io.build.AbstractStreamBuilder; 044import org.apache.commons.io.function.IOConsumer; 045import org.apache.commons.io.output.XmlStreamWriter; 046 047/** 048 * Character stream that handles all the necessary Voodoo to figure out the charset encoding of the XML document within the stream. 
049 * <p> 050 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream. 051 * </p> 052 * <p> 053 * All this has to be done without consuming characters from the stream, if not the XML parser will not recognized the document as a valid XML. This is not 100% 054 * true, but it's close enough (UTF-8 BOM is not handled by all parsers right now, XmlStreamReader handles it and things work in all parsers). 055 * </p> 056 * <p> 057 * The XmlStreamReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors. 058 * </p> 059 * <p> 060 * By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for a script (following HTTP MIME and XML 061 * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining 062 * the character encoding of a feed</a>. 063 * </p> 064 * <p> 065 * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under Apache License 2.0. 066 * </p> 067 * 068 * @see org.apache.commons.io.output.XmlStreamWriter 069 * @since 2.0 070 */ 071public class XmlStreamReader extends Reader { 072 073 /** 074 * Builds a new {@link XmlStreamWriter} instance. 075 * 076 * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection. 077 * <p> 078 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog 079 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type. 
080 * </p> 081 * <p> 082 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 083 * </p> 084 * <p> 085 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 086 * </p> 087 * <p> 088 * Else if the XML prolog had a charset encoding that encoding is used. 089 * </p> 090 * <p> 091 * Else if the content type had a charset encoding that encoding is used. 092 * </p> 093 * <p> 094 * Else 'UTF-8' is used. 095 * </p> 096 * <p> 097 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 098 * </p> 099 * <p> 100 * For example: 101 * </p> 102 * 103 * <pre>{@code 104 * XmlStreamReader r = XmlStreamReader.builder() 105 * .setPath(path) 106 * .setCharset(StandardCharsets.UTF_8) 107 * .get()} 108 * </pre> 109 * <p> 110 * 111 * @since 2.12.0 112 */ 113 public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> { 114 115 private boolean nullCharset = true; 116 private boolean lenient = true; 117 private String httpContentType; 118 119 /** 120 * Constructs a new instance. 121 * 122 * @throws UnsupportedOperationException if the origin cannot be converted to an InputStream. 123 */ 124 @SuppressWarnings("resource") 125 @Override 126 public XmlStreamReader get() throws IOException { 127 final String defaultEncoding = nullCharset ? null : getCharset().name(); 128 // @formatter:off 129 return httpContentType == null 130 ? 
new XmlStreamReader(getOrigin().getInputStream(), lenient, defaultEncoding) 131 : new XmlStreamReader(getOrigin().getInputStream(), httpContentType, lenient, defaultEncoding); 132 // @formatter:on 133 } 134 135 @Override 136 public Builder setCharset(final Charset charset) { 137 nullCharset = charset == null; 138 return super.setCharset(charset); 139 } 140 141 @Override 142 public Builder setCharset(final String charset) { 143 nullCharset = charset == null; 144 return super.setCharset(Charsets.toCharset(charset, getCharsetDefault())); 145 } 146 147 public Builder setHttpContentType(final String httpContentType) { 148 this.httpContentType = httpContentType; 149 return this; 150 } 151 152 public Builder setLenient(final boolean lenient) { 153 this.lenient = lenient; 154 return this; 155 } 156 157 } 158 159 private static final String UTF_8 = StandardCharsets.UTF_8.name(); 160 161 private static final String US_ASCII = StandardCharsets.US_ASCII.name(); 162 163 private static final String UTF_16BE = StandardCharsets.UTF_16BE.name(); 164 165 private static final String UTF_16LE = StandardCharsets.UTF_16LE.name(); 166 167 private static final String UTF_32BE = "UTF-32BE"; 168 169 private static final String UTF_32LE = "UTF-32LE"; 170 171 private static final String UTF_16 = StandardCharsets.UTF_16.name(); 172 173 private static final String UTF_32 = "UTF-32"; 174 175 private static final String EBCDIC = "CP1047"; 176 177 private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, 178 ByteOrderMark.UTF_32LE }; 179 180 /** UTF_16LE and UTF_32LE have the same two starting BOM bytes. 
     */
    // Each entry is the byte sequence of the characters "<?xm" in the named encoding; it is matched against the
    // start of the stream to guess the encoding before the XML prolog can be decoded.
    private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D),
            new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
            new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
            new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
            new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) };

    // Captures the value of a charset= parameter, with or without surrounding quotes, from a content-type header.
    private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?");

    /**
     * Pattern capturing the encoding of the "xml" processing instruction. Group 1 is the encoding value including its surrounding quotes.
     */
    public static final Pattern ENCODING_PATTERN = Pattern.compile("<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE);

    // MessageFormat templates for the XmlStreamReaderException messages raised by the raw-stream rules.
    private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

    private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

    // MessageFormat templates for the XmlStreamReaderException messages raised by the HTTP content-type rules.
    private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL";

    private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";

    private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME";

    /**
     * Constructs a new {@link Builder}.
     *
     * @return a new {@link Builder}.
     * @since 2.12.0
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Gets the charset parameter value, NULL if not present, NULL if httpContentType is NULL.
216 * 217 * @param httpContentType the HTTP content type 218 * @return The content type encoding (upcased) 219 */ 220 static String getContentTypeEncoding(final String httpContentType) { 221 String encoding = null; 222 if (httpContentType != null) { 223 final int i = httpContentType.indexOf(";"); 224 if (i > -1) { 225 final String postMime = httpContentType.substring(i + 1); 226 final Matcher m = CHARSET_PATTERN.matcher(postMime); 227 encoding = m.find() ? m.group(1) : null; 228 encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null; 229 } 230 } 231 return encoding; 232 } 233 234 /** 235 * Gets the MIME type or NULL if httpContentType is NULL. 236 * 237 * @param httpContentType the HTTP content type 238 * @return The mime content type 239 */ 240 static String getContentTypeMime(final String httpContentType) { 241 String mime = null; 242 if (httpContentType != null) { 243 final int i = httpContentType.indexOf(";"); 244 if (i >= 0) { 245 mime = httpContentType.substring(0, i); 246 } else { 247 mime = httpContentType; 248 } 249 mime = mime.trim(); 250 } 251 return mime; 252 } 253 254 /** 255 * Gets the encoding declared in the <?xml encoding=...?>, NULL if none. 256 * 257 * @param inputStream InputStream to create the reader from. 258 * @param guessedEnc guessed encoding 259 * @return the encoding declared in the <?xml encoding=...?> 260 * @throws IOException thrown if there is a problem reading the stream. 
261 */ 262 private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException { 263 String encoding = null; 264 if (guessedEnc != null) { 265 final byte[] bytes = IOUtils.byteArray(); 266 inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE); 267 int offset = 0; 268 int max = IOUtils.DEFAULT_BUFFER_SIZE; 269 int c = inputStream.read(bytes, offset, max); 270 int firstGT = -1; 271 String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning) 272 while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) { 273 offset += c; 274 max -= c; 275 c = inputStream.read(bytes, offset, max); 276 xmlProlog = new String(bytes, 0, offset, guessedEnc); 277 firstGT = xmlProlog.indexOf('>'); 278 } 279 if (firstGT == -1) { 280 if (c == -1) { 281 throw new IOException("Unexpected end of XML stream"); 282 } 283 throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes"); 284 } 285 final int bytesRead = offset; 286 if (bytesRead > 0) { 287 inputStream.reset(); 288 final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1))); 289 final StringBuilder prolog = new StringBuilder(); 290 IOConsumer.forEach(bReader.lines(), prolog::append); 291 final Matcher m = ENCODING_PATTERN.matcher(prolog); 292 if (m.find()) { 293 encoding = m.group(1).toUpperCase(Locale.ROOT); 294 encoding = encoding.substring(1, encoding.length() - 1); 295 } 296 } 297 } 298 return encoding; 299 } 300 301 /** 302 * Tests if the MIME type belongs to the APPLICATION XML family. 
303 * 304 * @param mime The mime type 305 * @return true if the mime type belongs to the APPLICATION XML family, otherwise false 306 */ 307 static boolean isAppXml(final String mime) { 308 return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity") 309 || mime.startsWith("application/") && mime.endsWith("+xml")); 310 } 311 312 /** 313 * Tests if the MIME type belongs to the TEXT XML family. 314 * 315 * @param mime The mime type 316 * @return true if the mime type belongs to the TEXT XML family, otherwise false 317 */ 318 static boolean isTextXml(final String mime) { 319 return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml")); 320 } 321 322 private final Reader reader; 323 324 private final String encoding; 325 326 private final String defaultEncoding; 327 328 /** 329 * Constructs a Reader for a File. 330 * <p> 331 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8. 332 * </p> 333 * <p> 334 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 335 * </p> 336 * 337 * @param file File to create a Reader from. 338 * @throws NullPointerException if the input is {@code null}. 339 * @throws IOException thrown if there is a problem reading the file. 340 * @deprecated Use {@link #builder()} 341 */ 342 @Deprecated 343 public XmlStreamReader(final File file) throws IOException { 344 this(Objects.requireNonNull(file, "file").toPath()); 345 } 346 347 /** 348 * Constructs a Reader for a raw InputStream. 349 * <p> 350 * It follows the same logic used for files. 351 * </p> 352 * <p> 353 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 354 * </p> 355 * 356 * @param inputStream InputStream to create a Reader from. 
357 * @throws NullPointerException if the input stream is {@code null}. 358 * @throws IOException thrown if there is a problem reading the stream. 359 * @deprecated Use {@link #builder()} 360 */ 361 @Deprecated 362 public XmlStreamReader(final InputStream inputStream) throws IOException { 363 this(inputStream, true); 364 } 365 366 /** 367 * Constructs a Reader for a raw InputStream. 368 * <p> 369 * It follows the same logic used for files. 370 * </p> 371 * <p> 372 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 373 * </p> 374 * <p> 375 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 376 * </p> 377 * <p> 378 * Else if the XML prolog had a charset encoding that encoding is used. 379 * </p> 380 * <p> 381 * Else if the content type had a charset encoding that encoding is used. 382 * </p> 383 * <p> 384 * Else 'UTF-8' is used. 385 * </p> 386 * <p> 387 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 388 * </p> 389 * 390 * @param inputStream InputStream to create a Reader from. 391 * @param lenient indicates if the charset encoding detection should be relaxed. 392 * @throws NullPointerException if the input stream is {@code null}. 393 * @throws IOException thrown if there is a problem reading the stream. 394 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs. 395 * @deprecated Use {@link #builder()} 396 */ 397 @Deprecated 398 public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException { 399 this(inputStream, lenient, null); 400 } 401 402 /** 403 * Constructs a Reader for a raw InputStream. 404 * <p> 405 * It follows the same logic used for files. 
406 * </p> 407 * <p> 408 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 409 * </p> 410 * <p> 411 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 412 * </p> 413 * <p> 414 * Else if the XML prolog had a charset encoding that encoding is used. 415 * </p> 416 * <p> 417 * Else if the content type had a charset encoding that encoding is used. 418 * </p> 419 * <p> 420 * Else 'UTF-8' is used. 421 * </p> 422 * <p> 423 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 424 * </p> 425 * 426 * @param inputStream InputStream to create a Reader from. 427 * @param lenient indicates if the charset encoding detection should be relaxed. 428 * @param defaultEncoding The default encoding 429 * @throws NullPointerException if the input stream is {@code null}. 430 * @throws IOException thrown if there is a problem reading the stream. 431 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs. 432 * @deprecated Use {@link #builder()} 433 */ 434 @Deprecated 435 @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance. 436 public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException { 437 Objects.requireNonNull(inputStream, "inputStream"); 438 this.defaultEncoding = defaultEncoding; 439 final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE), false, BOMS); 440 final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 441 this.encoding = doRawStream(bom, pis, lenient); 442 this.reader = new InputStreamReader(pis, encoding); 443 } 444 445 /** 446 * Constructs a Reader using an InputStream and the associated content-type header. 447 * <p> 448 * First it checks if the stream has BOM. 
If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog 449 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type. 450 * </p> 451 * <p> 452 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 453 * </p> 454 * 455 * @param inputStream InputStream to create the reader from. 456 * @param httpContentType content-type header to use for the resolution of the charset encoding. 457 * @throws NullPointerException if the input stream is {@code null}. 458 * @throws IOException thrown if there is a problem reading the file. 459 * @deprecated Use {@link #builder()} 460 */ 461 @Deprecated 462 public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException { 463 this(inputStream, httpContentType, true); 464 } 465 466 /** 467 * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection. 468 * <p> 469 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog 470 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type. 471 * </p> 472 * <p> 473 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 474 * </p> 475 * <p> 476 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 477 * </p> 478 * <p> 479 * Else if the XML prolog had a charset encoding that encoding is used. 480 * </p> 481 * <p> 482 * Else if the content type had a charset encoding that encoding is used. 483 * </p> 484 * <p> 485 * Else 'UTF-8' is used. 486 * </p> 487 * <p> 488 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 
489 * </p> 490 * 491 * @param inputStream InputStream to create the reader from. 492 * @param httpContentType content-type header to use for the resolution of the charset encoding. 493 * @param lenient indicates if the charset encoding detection should be relaxed. 494 * @throws NullPointerException if the input stream is {@code null}. 495 * @throws IOException thrown if there is a problem reading the file. 496 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification. 497 * @deprecated Use {@link #builder()} 498 */ 499 @Deprecated 500 public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException { 501 this(inputStream, httpContentType, lenient, null); 502 } 503 504 /** 505 * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection. 506 * <p> 507 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog 508 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type. 509 * </p> 510 * <p> 511 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 512 * </p> 513 * <p> 514 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 515 * </p> 516 * <p> 517 * Else if the XML prolog had a charset encoding that encoding is used. 518 * </p> 519 * <p> 520 * Else if the content type had a charset encoding that encoding is used. 521 * </p> 522 * <p> 523 * Else 'UTF-8' is used. 524 * </p> 525 * <p> 526 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 527 * </p> 528 * 529 * @param inputStream InputStream to create the reader from. 
530 * @param httpContentType content-type header to use for the resolution of the charset encoding. 531 * @param lenient indicates if the charset encoding detection should be relaxed. 532 * @param defaultEncoding The default encoding 533 * @throws NullPointerException if the input stream is {@code null}. 534 * @throws IOException thrown if there is a problem reading the file. 535 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification. 536 * @deprecated Use {@link #builder()} 537 */ 538 @Deprecated 539 @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance. 540 public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding) 541 throws IOException { 542 Objects.requireNonNull(inputStream, "inputStream"); 543 this.defaultEncoding = defaultEncoding; 544 final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE), false, BOMS); 545 final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 546 this.encoding = processHttpStream(bom, pis, httpContentType, lenient); 547 this.reader = new InputStreamReader(pis, encoding); 548 } 549 550 /** 551 * Constructs a Reader for a File. 552 * <p> 553 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8. 554 * </p> 555 * <p> 556 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 557 * </p> 558 * 559 * @param file File to create a Reader from. 560 * @throws NullPointerException if the input is {@code null}. 561 * @throws IOException thrown if there is a problem reading the file. 562 * @since 2.11.0 563 * @deprecated Use {@link #builder()} 564 */ 565 @Deprecated 566 @SuppressWarnings("resource") // InputStream is managed through another reader in this instance. 
567 public XmlStreamReader(final Path file) throws IOException { 568 this(Files.newInputStream(Objects.requireNonNull(file, "file"))); 569 } 570 571 /** 572 * Constructs a Reader using the InputStream of a URL. 573 * <p> 574 * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic used for Files. 575 * </p> 576 * <p> 577 * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with content-type. 578 * </p> 579 * <p> 580 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 581 * </p> 582 * 583 * @param url URL to create a Reader from. 584 * @throws NullPointerException if the input is {@code null}. 585 * @throws IOException thrown if there is a problem reading the stream of the URL. 586 */ 587 public XmlStreamReader(final URL url) throws IOException { 588 this(Objects.requireNonNull(url, "url").openConnection(), null); 589 } 590 591 /** 592 * Constructs a Reader using the InputStream of a URLConnection. 593 * <p> 594 * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data it uses the same logic used for files. 595 * </p> 596 * <p> 597 * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with 598 * content-type. 599 * </p> 600 * <p> 601 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 602 * </p> 603 * 604 * @param urlConnection URLConnection to create a Reader from. 605 * @param defaultEncoding The default encoding 606 * @throws NullPointerException if the input is {@code null}. 607 * @throws IOException thrown if there is a problem reading the stream of the URLConnection. 
608 */ 609 public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException { 610 Objects.requireNonNull(urlConnection, "urlConnection"); 611 this.defaultEncoding = defaultEncoding; 612 final boolean lenient = true; 613 final String contentType = urlConnection.getContentType(); 614 final InputStream inputStream = urlConnection.getInputStream(); 615 @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance 616 // @formatter:off 617 final BOMInputStream bomInput = BOMInputStream.builder() 618 .setInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE)) 619 .setInclude(false) 620 .setByteOrderMarks(BOMS) 621 .get(); 622 @SuppressWarnings("resource") 623 final BOMInputStream piInput = BOMInputStream.builder() 624 .setInputStream(new BufferedInputStream(bomInput, IOUtils.DEFAULT_BUFFER_SIZE)) 625 .setInclude(true) 626 .setByteOrderMarks(XML_GUESS_BYTES) 627 .get(); 628 // @formatter:on 629 if (urlConnection instanceof HttpURLConnection || contentType != null) { 630 this.encoding = processHttpStream(bomInput, piInput, contentType, lenient); 631 } else { 632 this.encoding = doRawStream(bomInput, piInput, lenient); 633 } 634 this.reader = new InputStreamReader(piInput, encoding); 635 } 636 637 /** 638 * Calculates the HTTP encoding. 639 * 640 * @param httpContentType The HTTP content type 641 * @param bomEnc BOM encoding 642 * @param xmlGuessEnc XML Guess encoding 643 * @param xmlEnc XML encoding 644 * @param lenient indicates if the charset encoding detection should be relaxed. 645 * @return the HTTP encoding 646 * @throws IOException thrown if there is a problem reading the stream. 
647 */ 648 String calculateHttpEncoding(final String httpContentType, final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient) 649 throws IOException { 650 651 // Lenient and has XML encoding 652 if (lenient && xmlEnc != null) { 653 return xmlEnc; 654 } 655 656 // Determine mime/encoding content types from HTTP Content Type 657 final String cTMime = getContentTypeMime(httpContentType); 658 final String cTEnc = getContentTypeEncoding(httpContentType); 659 final boolean appXml = isAppXml(cTMime); 660 final boolean textXml = isTextXml(cTMime); 661 662 // Mime type NOT "application/xml" or "text/xml" 663 if (!appXml && !textXml) { 664 final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 665 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 666 } 667 668 // No content type encoding 669 if (cTEnc == null) { 670 if (appXml) { 671 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); 672 } 673 return defaultEncoding == null ? 
US_ASCII : defaultEncoding; 674 } 675 676 // UTF-16BE or UTF-16LE content type encoding 677 if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) { 678 if (bomEnc != null) { 679 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 680 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 681 } 682 return cTEnc; 683 } 684 685 // UTF-16 content type encoding 686 if (cTEnc.equals(UTF_16)) { 687 if (bomEnc != null && bomEnc.startsWith(UTF_16)) { 688 return bomEnc; 689 } 690 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 691 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 692 } 693 694 // UTF-32BE or UTF-132E content type encoding 695 if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) { 696 if (bomEnc != null) { 697 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 698 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 699 } 700 return cTEnc; 701 } 702 703 // UTF-32 content type encoding 704 if (cTEnc.equals(UTF_32)) { 705 if (bomEnc != null && bomEnc.startsWith(UTF_32)) { 706 return bomEnc; 707 } 708 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 709 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 710 } 711 712 return cTEnc; 713 } 714 715 /** 716 * Calculate the raw encoding. 717 * 718 * @param bomEnc BOM encoding 719 * @param xmlGuessEnc XML Guess encoding 720 * @param xmlEnc XML encoding 721 * @return the raw encoding 722 * @throws IOException thrown if there is a problem reading the stream. 
723 */ 724 String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException { 725 726 // BOM is Null 727 if (bomEnc == null) { 728 if (xmlGuessEnc == null || xmlEnc == null) { 729 return defaultEncoding == null ? UTF_8 : defaultEncoding; 730 } 731 if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) { 732 return xmlGuessEnc; 733 } 734 return xmlEnc; 735 } 736 737 // BOM is UTF-8 738 if (bomEnc.equals(UTF_8)) { 739 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) { 740 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 741 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 742 } 743 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) { 744 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 745 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 746 } 747 return bomEnc; 748 } 749 750 // BOM is UTF-16BE or UTF-16LE 751 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) { 752 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) { 753 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 754 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 755 } 756 if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) { 757 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 758 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 759 } 760 return bomEnc; 761 } 762 763 // BOM is UTF-32BE or UTF-32LE 764 if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) { 765 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) { 766 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 767 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 768 } 769 if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) { 770 final String msg = 
MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 771 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 772 } 773 return bomEnc; 774 } 775 776 // BOM is something else 777 final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc); 778 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 779 } 780 781 /** 782 * Closes the XmlStreamReader stream. 783 * 784 * @throws IOException thrown if there was a problem closing the stream. 785 */ 786 @Override 787 public void close() throws IOException { 788 reader.close(); 789 } 790 791 /** 792 * Does lenient detection. 793 * 794 * @param httpContentType content-type header to use for the resolution of the charset encoding. 795 * @param ex The thrown exception 796 * @return the encoding 797 * @throws IOException thrown if there is a problem reading the stream. 798 */ 799 private String doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException { 800 if (httpContentType != null && httpContentType.startsWith("text/html")) { 801 httpContentType = httpContentType.substring("text/html".length()); 802 httpContentType = "text/xml" + httpContentType; 803 try { 804 return calculateHttpEncoding(httpContentType, ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true); 805 } catch (final XmlStreamReaderException ex2) { 806 ex = ex2; 807 } 808 } 809 String encoding = ex.getXmlEncoding(); 810 if (encoding == null) { 811 encoding = ex.getContentTypeEncoding(); 812 } 813 if (encoding == null) { 814 encoding = defaultEncoding == null ? UTF_8 : defaultEncoding; 815 } 816 return encoding; 817 } 818 819 /** 820 * Process the raw stream. 821 * 822 * @param bom BOMInputStream to detect byte order marks 823 * @param pis BOMInputStream to guess XML encoding 824 * @param lenient indicates if the charset encoding detection should be relaxed. 
825 * @return the encoding to be used 826 * @throws IOException thrown if there is a problem reading the stream. 827 */ 828 private String doRawStream(final BOMInputStream bom, final BOMInputStream pis, final boolean lenient) throws IOException { 829 final String bomEnc = bom.getBOMCharsetName(); 830 final String xmlGuessEnc = pis.getBOMCharsetName(); 831 final String xmlEnc = getXmlProlog(pis, xmlGuessEnc); 832 try { 833 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); 834 } catch (final XmlStreamReaderException ex) { 835 if (lenient) { 836 return doLenientDetection(null, ex); 837 } 838 throw ex; 839 } 840 } 841 842 /** 843 * Gets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate. 844 * <p> 845 * If it is NULL the content-type based rules are used. 846 * </p> 847 * 848 * @return the default encoding to use. 849 */ 850 public String getDefaultEncoding() { 851 return defaultEncoding; 852 } 853 854 /** 855 * Gets the charset encoding of the XmlStreamReader. 856 * 857 * @return charset encoding. 858 */ 859 public String getEncoding() { 860 return encoding; 861 } 862 863 /** 864 * Processes an HTTP stream. 865 * 866 * @param bomInput BOMInputStream to detect byte order marks 867 * @param piInput BOMInputStream to guess XML encoding 868 * @param httpContentType The HTTP content type 869 * @param lenient indicates if the charset encoding detection should be relaxed. 870 * @return the encoding to be used 871 * @throws IOException thrown if there is a problem reading the stream. 
*/
    private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final String httpContentType, final boolean lenient)
            throws IOException {
        final String bomCharsetName = bomInput.getBOMCharsetName();
        final String guessedCharsetName = piInput.getBOMCharsetName();
        final String prologCharsetName = getXmlProlog(piInput, guessedCharsetName);
        try {
            return calculateHttpEncoding(httpContentType, bomCharsetName, guessedCharsetName, prologCharsetName, lenient);
        } catch (final XmlStreamReaderException failure) {
            if (!lenient) {
                throw failure;
            }
            // Lenient mode: recover an encoding from the content type and the details carried by the exception.
            return doLenientDetection(httpContentType, failure);
        }
    }

    /**
     * Invokes the underlying reader's {@code read(char[], int, int)} method.
     *
     * @param buf    the buffer to read the characters into
     * @param offset The start offset
     * @param len    The number of characters to read
     * @return the number of characters read or -1 if the end of stream
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public int read(final char[] buf, final int offset, final int len) throws IOException {
        final int charsRead = reader.read(buf, offset, len);
        return charsRead;
    }

}