001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.csv; 019 020import static org.apache.commons.csv.Token.Type.TOKEN; 021 022import java.io.Closeable; 023import java.io.File; 024import java.io.FileInputStream; 025import java.io.IOException; 026import java.io.InputStream; 027import java.io.InputStreamReader; 028import java.io.Reader; 029import java.io.StringReader; 030import java.net.URL; 031import java.nio.charset.Charset; 032import java.nio.file.Files; 033import java.nio.file.Path; 034import java.util.ArrayList; 035import java.util.Arrays; 036import java.util.Iterator; 037import java.util.LinkedHashMap; 038import java.util.List; 039import java.util.Map; 040import java.util.NoSuchElementException; 041import java.util.TreeMap; 042 043/** 044 * Parses CSV files according to the specified format. 045 * 046 * Because CSV appears in many different dialects, the parser supports many formats by allowing the 047 * specification of a {@link CSVFormat}. 048 * 049 * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream. 050 * 051 * <h2>Creating instances</h2> 052 * <p> 053 * There are several static factory methods that can be used to create instances for various types of resources: 054 * </p> 055 * <ul> 056 * <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li> 057 * <li>{@link #parse(String, CSVFormat)}</li> 058 * <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li> 059 * </ul> 060 * <p> 061 * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor. 062 * 063 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut: 064 * </p> 065 * <pre> 066 * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) { 067 * ... 068 * } 069 * </pre> 070 * 071 * <h2>Parsing record wise</h2> 072 * <p> 073 * To parse a CSV input from a file, you write: 074 * </p> 075 * 076 * <pre> 077 * File csvData = new File("/path/to/csv"); 078 * CSVParser parser = CSVParser.parse(csvData, CSVFormat.RFC4180); 079 * for (CSVRecord csvRecord : parser) { 080 * ... 081 * } 082 * </pre> 083 * 084 * <p> 085 * This will read the parse the contents of the file using the 086 * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format. 087 * </p> 088 * 089 * <p> 090 * To parse CSV input in a format like Excel, you write: 091 * </p> 092 * 093 * <pre> 094 * CSVParser parser = CSVParser.parse(csvData, CSVFormat.EXCEL); 095 * for (CSVRecord csvRecord : parser) { 096 * ... 097 * } 098 * </pre> 099 * 100 * <p> 101 * If the predefined formats don't match the format at hands, custom formats can be defined. More information about 102 * customising CSVFormats is available in {@link CSVFormat CSVFormat JavaDoc}. 103 * </p> 104 * 105 * <h2>Parsing into memory</h2> 106 * <p> 107 * If parsing record wise is not desired, the contents of the input can be read completely into memory. 108 * </p> 109 * 110 * <pre> 111 * Reader in = new StringReader("a;b\nc;d"); 112 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL); 113 * List<CSVRecord> list = parser.getRecords(); 114 * </pre> 115 * 116 * <p> 117 * There are two constraints that have to be kept in mind: 118 * </p> 119 * 120 * <ol> 121 * <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from 122 * the input, those records will not end up in the in memory representation of your CSV data.</li> 123 * <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're 124 * parsing a 150MB file of CSV data the contents will be read completely into memory.</li> 125 * </ol> 126 * 127 * <h2>Notes</h2> 128 * <p> 129 * Internal parser state is completely covered by the format and the reader-state. 130 * </p> 131 * 132 * @see <a href="package-summary.html">package documentation for more details</a> 133 */ 134public final class CSVParser implements Iterable<CSVRecord>, Closeable { 135 136 /** 137 * Creates a parser for the given {@link File}. 138 * 139 * @param file 140 * a CSV file. Must not be null. 141 * @param charset 142 * A Charset 143 * @param format 144 * the CSVFormat used for CSV parsing. Must not be null. 145 * @return a new parser 146 * @throws IllegalArgumentException 147 * If the parameters of the format are inconsistent or if either file or format are null. 148 * @throws IOException 149 * If an I/O error occurs 150 */ 151 @SuppressWarnings("resource") 152 public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException { 153 Assertions.notNull(file, "file"); 154 Assertions.notNull(format, "format"); 155 return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format); 156 } 157 158 /** 159 * Creates a CSV parser using the given {@link CSVFormat}. 160 * 161 * <p> 162 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, 163 * unless you close the {@code reader}. 164 * </p> 165 * 166 * @param inputStream 167 * an InputStream containing CSV-formatted input. Must not be null. 168 * @param charset 169 * a Charset. 170 * @param format 171 * the CSVFormat used for CSV parsing. Must not be null. 172 * @return a new CSVParser configured with the given reader and format. 173 * @throws IllegalArgumentException 174 * If the parameters of the format are inconsistent or if either reader or format are null. 175 * @throws IOException 176 * If there is a problem reading the header or skipping the first record 177 * @since 1.5 178 */ 179 @SuppressWarnings("resource") 180 public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format) 181 throws IOException { 182 Assertions.notNull(inputStream, "inputStream"); 183 Assertions.notNull(format, "format"); 184 return parse(new InputStreamReader(inputStream, charset), format); 185 } 186 187 /** 188 * Creates a parser for the given {@link Path}. 189 * 190 * @param path 191 * a CSV file. Must not be null. 192 * @param charset 193 * A Charset 194 * @param format 195 * the CSVFormat used for CSV parsing. Must not be null. 196 * @return a new parser 197 * @throws IllegalArgumentException 198 * If the parameters of the format are inconsistent or if either file or format are null. 199 * @throws IOException 200 * If an I/O error occurs 201 * @since 1.5 202 */ 203 public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException { 204 Assertions.notNull(path, "path"); 205 Assertions.notNull(format, "format"); 206 return parse(Files.newBufferedReader(path, charset), format); 207 } 208 209 /** 210 * Creates a CSV parser using the given {@link CSVFormat} 211 * 212 * <p> 213 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, 214 * unless you close the {@code reader}. 215 * </p> 216 * 217 * @param reader 218 * a Reader containing CSV-formatted input. Must not be null. 219 * @param format 220 * the CSVFormat used for CSV parsing. Must not be null. 221 * @return a new CSVParser configured with the given reader and format. 222 * @throws IllegalArgumentException 223 * If the parameters of the format are inconsistent or if either reader or format are null. 224 * @throws IOException 225 * If there is a problem reading the header or skipping the first record 226 * @since 1.5 227 */ 228 public static CSVParser parse(Reader reader, final CSVFormat format) throws IOException { 229 return new CSVParser(reader, format); 230 } 231 232 /** 233 * Creates a parser for the given {@link String}. 234 * 235 * @param string 236 * a CSV string. Must not be null. 237 * @param format 238 * the CSVFormat used for CSV parsing. Must not be null. 239 * @return a new parser 240 * @throws IllegalArgumentException 241 * If the parameters of the format are inconsistent or if either string or format are null. 242 * @throws IOException 243 * If an I/O error occurs 244 */ 245 public static CSVParser parse(final String string, final CSVFormat format) throws IOException { 246 Assertions.notNull(string, "string"); 247 Assertions.notNull(format, "format"); 248 249 return new CSVParser(new StringReader(string), format); 250 } 251 252 /** 253 * Creates a parser for the given URL. 254 * 255 * <p> 256 * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless 257 * you close the {@code url}. 258 * </p> 259 * 260 * @param url 261 * a URL. Must not be null. 262 * @param charset 263 * the charset for the resource. Must not be null. 264 * @param format 265 * the CSVFormat used for CSV parsing. Must not be null. 266 * @return a new parser 267 * @throws IllegalArgumentException 268 * If the parameters of the format are inconsistent or if either url, charset or format are null. 269 * @throws IOException 270 * If an I/O error occurs 271 */ 272 public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException { 273 Assertions.notNull(url, "url"); 274 Assertions.notNull(charset, "charset"); 275 Assertions.notNull(format, "format"); 276 277 return new CSVParser(new InputStreamReader(url.openStream(), charset), format); 278 } 279 280 // the following objects are shared to reduce garbage 281 282 private final CSVFormat format; 283 284 /** A mapping of column names to column indices */ 285 private final Map<String, Integer> headerMap; 286 287 private final Lexer lexer; 288 289 /** A record buffer for getRecord(). Grows as necessary and is reused. */ 290 private final List<String> recordList = new ArrayList<>(); 291 292 /** 293 * The next record number to assign. 294 */ 295 private long recordNumber; 296 297 /** 298 * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination 299 * with {@link #recordNumber}. 300 */ 301 private final long characterOffset; 302 303 private final Token reusableToken = new Token(); 304 305 /** 306 * Customized CSV parser using the given {@link CSVFormat} 307 * 308 * <p> 309 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, 310 * unless you close the {@code reader}. 311 * </p> 312 * 313 * @param reader 314 * a Reader containing CSV-formatted input. Must not be null. 315 * @param format 316 * the CSVFormat used for CSV parsing. Must not be null. 317 * @throws IllegalArgumentException 318 * If the parameters of the format are inconsistent or if either reader or format are null. 319 * @throws IOException 320 * If there is a problem reading the header or skipping the first record 321 */ 322 public CSVParser(final Reader reader, final CSVFormat format) throws IOException { 323 this(reader, format, 0, 1); 324 } 325 326 /** 327 * Customized CSV parser using the given {@link CSVFormat} 328 * 329 * <p> 330 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, 331 * unless you close the {@code reader}. 332 * </p> 333 * 334 * @param reader 335 * a Reader containing CSV-formatted input. Must not be null. 336 * @param format 337 * the CSVFormat used for CSV parsing. Must not be null. 338 * @param characterOffset 339 * Lexer offset when the parser does not start parsing at the beginning of the source. 340 * @param recordNumber 341 * The next record number to assign 342 * @throws IllegalArgumentException 343 * If the parameters of the format are inconsistent or if either reader or format are null. 344 * @throws IOException 345 * If there is a problem reading the header or skipping the first record 346 * @since 1.1 347 */ 348 @SuppressWarnings("resource") 349 public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) 350 throws IOException { 351 Assertions.notNull(reader, "reader"); 352 Assertions.notNull(format, "format"); 353 354 this.format = format; 355 this.lexer = new Lexer(format, new ExtendedBufferedReader(reader)); 356 this.headerMap = this.initializeHeader(); 357 this.characterOffset = characterOffset; 358 this.recordNumber = recordNumber - 1; 359 } 360 361 private void addRecordValue(final boolean lastRecord) { 362 final String input = this.reusableToken.content.toString(); 363 final String inputClean = this.format.getTrim() ? input.trim() : input; 364 if (lastRecord && inputClean.isEmpty() && this.format.getTrailingDelimiter()) { 365 return; 366 } 367 final String nullString = this.format.getNullString(); 368 this.recordList.add(inputClean.equals(nullString) ? null : inputClean); 369 } 370 371 /** 372 * Closes resources. 373 * 374 * @throws IOException 375 * If an I/O error occurs 376 */ 377 @Override 378 public void close() throws IOException { 379 if (this.lexer != null) { 380 this.lexer.close(); 381 } 382 } 383 384 /** 385 * Returns the current line number in the input stream. 386 * 387 * <p> 388 * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to 389 * the record number. 390 * </p> 391 * 392 * @return current line number 393 */ 394 public long getCurrentLineNumber() { 395 return this.lexer.getCurrentLineNumber(); 396 } 397 398 /** 399 * Gets the first end-of-line string encountered. 400 * 401 * @return the first end-of-line string 402 * @since 1.5 403 */ 404 public String getFirstEndOfLine() { 405 return lexer.getFirstEol(); 406 } 407 408 /** 409 * Returns a copy of the header map that iterates in column order. 410 * <p> 411 * The map keys are column names. The map values are 0-based indices. 412 * </p> 413 * @return a copy of the header map that iterates in column order. 414 */ 415 public Map<String, Integer> getHeaderMap() { 416 return this.headerMap == null ? null : new LinkedHashMap<>(this.headerMap); 417 } 418 419 /** 420 * Returns the current record number in the input stream. 421 * 422 * <p> 423 * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to 424 * the line number. 425 * </p> 426 * 427 * @return current record number 428 */ 429 public long getRecordNumber() { 430 return this.recordNumber; 431 } 432 433 /** 434 * Parses the CSV input according to the given format and returns the content as a list of 435 * {@link CSVRecord CSVRecords}. 436 * 437 * <p> 438 * The returned content starts at the current parse-position in the stream. 439 * </p> 440 * 441 * @return list of {@link CSVRecord CSVRecords}, may be empty 442 * @throws IOException 443 * on parse error or input read-failure 444 */ 445 public List<CSVRecord> getRecords() throws IOException { 446 CSVRecord rec; 447 final List<CSVRecord> records = new ArrayList<>(); 448 while ((rec = this.nextRecord()) != null) { 449 records.add(rec); 450 } 451 return records; 452 } 453 454 /** 455 * Initializes the name to index mapping if the format defines a header. 456 * 457 * @return null if the format has no header. 458 * @throws IOException if there is a problem reading the header or skipping the first record 459 */ 460 private Map<String, Integer> initializeHeader() throws IOException { 461 Map<String, Integer> hdrMap = null; 462 final String[] formatHeader = this.format.getHeader(); 463 if (formatHeader != null) { 464 hdrMap = this.format.getIgnoreHeaderCase() ? 465 new TreeMap<String, Integer>(String.CASE_INSENSITIVE_ORDER) : 466 new LinkedHashMap<String, Integer>(); 467 468 String[] headerRecord = null; 469 if (formatHeader.length == 0) { 470 // read the header from the first line of the file 471 final CSVRecord nextRecord = this.nextRecord(); 472 if (nextRecord != null) { 473 headerRecord = nextRecord.values(); 474 } 475 } else { 476 if (this.format.getSkipHeaderRecord()) { 477 this.nextRecord(); 478 } 479 headerRecord = formatHeader; 480 } 481 482 // build the name to index mappings 483 if (headerRecord != null) { 484 for (int i = 0; i < headerRecord.length; i++) { 485 final String header = headerRecord[i]; 486 final boolean containsHeader = hdrMap.containsKey(header); 487 final boolean emptyHeader = header == null || header.trim().isEmpty(); 488 if (containsHeader && (!emptyHeader || !this.format.getAllowMissingColumnNames())) { 489 throw new IllegalArgumentException("The header contains a duplicate name: \"" + header + 490 "\" in " + Arrays.toString(headerRecord)); 491 } 492 hdrMap.put(header, Integer.valueOf(i)); 493 } 494 } 495 } 496 return hdrMap; 497 } 498 499 /** 500 * Gets whether this parser is closed. 501 * 502 * @return whether this parser is closed. 503 */ 504 public boolean isClosed() { 505 return this.lexer.isClosed(); 506 } 507 508 /** 509 * Returns an iterator on the records. 510 * 511 * <p> 512 * An {@link IOException} caught during the iteration are re-thrown as an 513 * {@link IllegalStateException}. 514 * </p> 515 * <p> 516 * If the parser is closed a call to {@link Iterator#next()} will throw a 517 * {@link NoSuchElementException}. 518 * </p> 519 */ 520 @Override 521 public Iterator<CSVRecord> iterator() { 522 return new Iterator<CSVRecord>() { 523 private CSVRecord current; 524 525 private CSVRecord getNextRecord() { 526 try { 527 return CSVParser.this.nextRecord(); 528 } catch (final IOException e) { 529 throw new IllegalStateException( 530 e.getClass().getSimpleName() + " reading next record: " + e.toString(), e); 531 } 532 } 533 534 @Override 535 public boolean hasNext() { 536 if (CSVParser.this.isClosed()) { 537 return false; 538 } 539 if (this.current == null) { 540 this.current = this.getNextRecord(); 541 } 542 543 return this.current != null; 544 } 545 546 @Override 547 public CSVRecord next() { 548 if (CSVParser.this.isClosed()) { 549 throw new NoSuchElementException("CSVParser has been closed"); 550 } 551 CSVRecord next = this.current; 552 this.current = null; 553 554 if (next == null) { 555 // hasNext() wasn't called before 556 next = this.getNextRecord(); 557 if (next == null) { 558 throw new NoSuchElementException("No more CSV records available"); 559 } 560 } 561 562 return next; 563 } 564 565 @Override 566 public void remove() { 567 throw new UnsupportedOperationException(); 568 } 569 }; 570 } 571 572 /** 573 * Parses the next record from the current point in the stream. 574 * 575 * @return the record as an array of values, or {@code null} if the end of the stream has been reached 576 * @throws IOException 577 * on parse error or input read-failure 578 */ 579 CSVRecord nextRecord() throws IOException { 580 CSVRecord result = null; 581 this.recordList.clear(); 582 StringBuilder sb = null; 583 final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset; 584 do { 585 this.reusableToken.reset(); 586 this.lexer.nextToken(this.reusableToken); 587 switch (this.reusableToken.type) { 588 case TOKEN: 589 this.addRecordValue(false); 590 break; 591 case EORECORD: 592 this.addRecordValue(true); 593 break; 594 case EOF: 595 if (this.reusableToken.isReady) { 596 this.addRecordValue(true); 597 } 598 break; 599 case INVALID: 600 throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence"); 601 case COMMENT: // Ignored currently 602 if (sb == null) { // first comment for this record 603 sb = new StringBuilder(); 604 } else { 605 sb.append(Constants.LF); 606 } 607 sb.append(this.reusableToken.content); 608 this.reusableToken.type = TOKEN; // Read another token 609 break; 610 default: 611 throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type); 612 } 613 } while (this.reusableToken.type == TOKEN); 614 615 if (!this.recordList.isEmpty()) { 616 this.recordNumber++; 617 final String comment = sb == null ? null : sb.toString(); 618 result = new CSVRecord(this.recordList.toArray(new String[this.recordList.size()]), this.headerMap, comment, 619 this.recordNumber, startCharPosition); 620 } 621 return result; 622 } 623 624}