/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io.input;

import static org.apache.commons.io.IOUtils.EOF;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.Objects;

import org.apache.commons.io.Charsets;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.build.AbstractStreamBuilder;
import org.apache.commons.io.charset.CharsetEncoders;

/**
 * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding.
 * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In
 * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced.
 * <p>
 * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the
 * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the
 * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a
 * {@link java.io.BufferedReader}.
 * </p>
 * <p>
 * {@link ReaderInputStream} implements the inverse transformation of {@link java.io.InputStreamReader}; in the following example, reading from {@code in2}
 * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding):
 * </p>
 *
 * <pre>
 * InputStream inputStream = ...
 * Charset cs = ...
 * InputStreamReader reader = new InputStreamReader(inputStream, cs);
 * ReaderInputStream in2 = new ReaderInputStream(reader, cs);
 * </pre>
 * <p>
 * {@link ReaderInputStream} implements the same transformation as {@link java.io.OutputStreamWriter}, except that the control flow is reversed: both classes
 * transform a character stream into a byte stream, but {@link java.io.OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream}
 * pulls it from the underlying stream.
 * </p>
 * <p>
 * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in
 * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way
 * to produce the data is as a character stream, i.e. by providing a {@link Reader} instance. An example of a situation where this problem may appear is when
 * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework.
 * </p>
 * <p>
 * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()} are not supported.
 * </p>
 * <p>
 * Instances of {@link ReaderInputStream} are not thread safe.
 * </p>
 *
 * @see org.apache.commons.io.output.WriterOutputStream
 * @since 2.0
 */
public class ReaderInputStream extends InputStream {

    /**
     * Builds a new {@link ReaderInputStream} instance.
     * <p>
     * For example:
     * </p>
     * <pre>{@code
     * ReaderInputStream s = ReaderInputStream.builder()
     *   .setPath(path)
     *   .setCharsetEncoder(Charset.defaultCharset().newEncoder())
     *   .get();}
     * </pre>
     *
     * @since 2.12.0
     */
    public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> {

        /** Encoder handed to the stream; kept in sync with the builder's charset by the setters below. */
        private CharsetEncoder charsetEncoder = super.getCharset().newEncoder();

        /**
         * Constructs a new instance.
         *
         * @return a new {@link ReaderInputStream} reading from this builder's origin.
         * @throws UnsupportedOperationException if the origin cannot be converted to a Reader.
         * @throws IOException if an I/O error occurs while obtaining the Reader.
         */
        @SuppressWarnings("resource")
        @Override
        public ReaderInputStream get() throws IOException {
            return new ReaderInputStream(getOrigin().getReader(getCharset()), charsetEncoder, getBufferSize());
        }

        /**
         * Sets the charset and replaces the current encoder with a new encoder for that charset.
         *
         * @param charset the charset.
         * @return this
         */
        @Override
        public Builder setCharset(final Charset charset) {
            charsetEncoder = charset.newEncoder();
            return super.setCharset(charset);
        }

        /**
         * Sets the charset encoder and aligns this builder's charset with the encoder's charset.
         *
         * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder.
         * @return this
         */
        public Builder setCharsetEncoder(final CharsetEncoder charsetEncoder) {
            // Same null handling as the ReaderInputStream(Reader, CharsetEncoder, int) constructor.
            this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder);
            super.setCharset(this.charsetEncoder.charset());
            return asThis();
        }

    }

    /**
     * Constructs a new {@link Builder}.
     *
     * @return a new {@link Builder}.
     * @since 2.12.0
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Validates that the given buffer size is at least {@link #minBufferSize(CharsetEncoder)}.
     *
     * @param charsetEncoder the encoder the buffer will feed.
     * @param bufferSize the requested buffer size in characters.
     * @return {@code bufferSize}, unchanged.
     * @throws IllegalArgumentException if {@code bufferSize} is smaller than the minimum for the encoder.
     */
    static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) {
        final float minRequired = minBufferSize(charsetEncoder);
        if (bufferSize < minRequired) {
            throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired,
                    charsetEncoder.charset().displayName()));
        }
        return bufferSize;
    }

    /**
     * Computes the minimum buffer size for an encoder: twice its {@link CharsetEncoder#maxBytesPerChar()}.
     *
     * @param charsetEncoder the encoder.
     * @return the minimum buffer size.
     */
    static float minBufferSize(final CharsetEncoder charsetEncoder) {
        return charsetEncoder.maxBytesPerChar() * 2;
    }

    private final Reader reader;

    private final CharsetEncoder charsetEncoder;

    /**
     * CharBuffer used as input for the encoder. It should be reasonably large as we read data from the underlying Reader into this buffer.
     */
    private final CharBuffer encoderIn;

    /**
     * ByteBuffer used as output for the encoder. This buffer can be small as it is only used to transfer data from the encoder to the buffer provided by the
     * caller.
     */
    private final ByteBuffer encoderOut;

    /** Result of the most recent encode or flush call; {@code null} until the first call to {@link #fillBuffer()}. */
    private CoderResult lastCoderResult;

    /** Set once the underlying Reader has reported end of stream. */
    private boolean endOfInput;

    /**
     * Constructs a new {@link ReaderInputStream} that uses the default character encoding with a default input buffer size of
     * {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
     *
     * @param reader the target {@link Reader}
     * @deprecated Use {@link ReaderInputStream#builder()} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader) {
        this(reader, Charset.defaultCharset());
    }

    /**
     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
     *
     * <p>
     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
     * </p>
     *
     * @param reader the target {@link Reader}
     * @param charset the charset encoding
     * @deprecated Use {@link ReaderInputStream#builder()} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader, final Charset charset) {
        this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE);
    }

    /**
     * Constructs a new {@link ReaderInputStream}.
     *
     * <p>
     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
     * </p>
     *
     * @param reader the target {@link Reader}.
     * @param charset the charset encoding.
     * @param bufferSize the size of the input buffer in number of characters.
     * @deprecated Use {@link ReaderInputStream#builder()} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) {
        // @formatter:off
        this(reader,
             Charsets.toCharset(charset).newEncoder()
                     .onMalformedInput(CodingErrorAction.REPLACE)
                     .onUnmappableCharacter(CodingErrorAction.REPLACE),
             bufferSize);
        // @formatter:on
    }

    /**
     * Constructs a new {@link ReaderInputStream}.
     *
     * <p>
     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
     * an encoder which had already been in use.
     * </p>
     *
     * @param reader the target {@link Reader}
     * @param charsetEncoder the charset encoder
     * @since 2.1
     * @deprecated Use {@link ReaderInputStream#builder()} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) {
        this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE);
    }

    /**
     * Constructs a new {@link ReaderInputStream}.
     *
     * <p>
     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
     * an encoder which had already been in use.
     * </p>
     *
     * @param reader the target {@link Reader}
     * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder.
     * @param bufferSize the size of the input buffer in number of characters
     * @since 2.1
     * @deprecated Use {@link ReaderInputStream#builder()} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) {
        this.reader = reader;
        this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder);
        this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize));
        // Both buffers start out flipped, i.e. empty from the point of view of a consumer.
        this.encoderIn.flip();
        this.encoderOut = ByteBuffer.allocate(128);
        this.encoderOut.flip();
    }

    /**
     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
     *
     * <p>
     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
     * </p>
     *
     * @param reader the target {@link Reader}
     * @param charsetName the name of the charset encoding
     * @deprecated Use {@link ReaderInputStream#builder()} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader, final String charsetName) {
        this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE);
    }

    /**
     * Constructs a new {@link ReaderInputStream}.
     *
     * <p>
     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
     * </p>
     *
     * @param reader the target {@link Reader}
     * @param charsetName the name of the charset encoding, null maps to the default Charset.
     * @param bufferSize the size of the input buffer in number of characters
     * @deprecated Use {@link ReaderInputStream#builder()} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) {
        this(reader, Charsets.toCharset(charsetName), bufferSize);
    }

    /**
     * Closes the stream. This method will cause the underlying {@link Reader} to be closed.
     *
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public void close() throws IOException {
        reader.close();
    }

    /**
     * Fills the internal char buffer from the reader and encodes as much of it as fits into the internal byte buffer.
     * <p>
     * Once the end of the reader has been reached and the encoder has been flushed, this method does nothing: asking a flushed {@link CharsetEncoder} to
     * encode again throws {@link IllegalStateException}, which would otherwise surface from a {@link #read()} issued after the end of the stream.
     * </p>
     *
     * @throws IOException If an I/O error occurs
     */
    private void fillBuffer() throws IOException {
        if (endOfInput) {
            // The final encode and flush already happened on the call that detected EOF.
            return;
        }
        // Only pull more characters when the encoder consumed its input; on overflow it still has pending input.
        if (lastCoderResult == null || lastCoderResult.isUnderflow()) {
            encoderIn.compact();
            final int position = encoderIn.position();
            // We don't use Reader#read(CharBuffer) here because it is more efficient
            // to write directly to the underlying char array (the default implementation
            // copies data to a temporary char array).
            final int c = reader.read(encoderIn.array(), position, encoderIn.remaining());
            if (c == EOF) {
                endOfInput = true;
            } else {
                encoderIn.position(position + c);
            }
            encoderIn.flip();
        }
        encoderOut.compact();
        lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput);
        if (endOfInput) {
            lastCoderResult = charsetEncoder.flush(encoderOut);
        }
        if (lastCoderResult.isError()) {
            lastCoderResult.throwException();
        }
        encoderOut.flip();
    }

    /**
     * Gets the CharsetEncoder.
     *
     * @return the CharsetEncoder.
     */
    CharsetEncoder getCharsetEncoder() {
        return charsetEncoder;
    }

    /**
     * Reads a single byte.
     *
     * @return either the byte read or {@code -1} if the end of the stream has been reached
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public int read() throws IOException {
        for (;;) {
            if (encoderOut.hasRemaining()) {
                return encoderOut.get() & 0xFF;
            }
            fillBuffer();
            if (endOfInput && !encoderOut.hasRemaining()) {
                return EOF;
            }
        }
    }

    /**
     * Reads the specified number of bytes into an array.
     *
     * @param b the byte array to read into
     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public int read(final byte[] b) throws IOException {
        return read(b, 0, b.length);
    }

    /**
     * Reads the specified number of bytes into an array.
     *
     * @param array the byte array to read into
     * @param off the offset to start reading bytes into
     * @param len the number of bytes to read
     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
     * @throws NullPointerException if {@code array} is null.
     * @throws IndexOutOfBoundsException if {@code off} or {@code len} is negative, or {@code off + len} exceeds the array length.
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public int read(final byte[] array, int off, int len) throws IOException {
        Objects.requireNonNull(array, "array");
        // Written as "len > array.length - off" so that a huge off + len cannot overflow int and slip through.
        if (len < 0 || off < 0 || len > array.length - off) {
            throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len);
        }
        if (len == 0) {
            return 0; // Always return 0 if len == 0
        }
        int read = 0;
        while (len > 0) {
            if (encoderOut.hasRemaining()) { // Data from the last read not fully copied
                final int c = Math.min(encoderOut.remaining(), len);
                encoderOut.get(array, off, c);
                off += c;
                len -= c;
                read += c;
            } else if (endOfInput) { // Already reach EOF in the last read
                break;
            } else { // Read again
                fillBuffer();
            }
        }
        return read == 0 && endOfInput ? EOF : read;
    }
}