001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.csv; 019 020import static org.apache.commons.csv.Token.Type.TOKEN; 021 022import java.io.Closeable; 023import java.io.File; 024import java.io.FileInputStream; 025import java.io.IOException; 026import java.io.InputStream; 027import java.io.InputStreamReader; 028import java.io.Reader; 029import java.io.StringReader; 030import java.net.URL; 031import java.nio.charset.Charset; 032import java.nio.file.Files; 033import java.nio.file.Path; 034import java.util.ArrayList; 035import java.util.Arrays; 036import java.util.Collections; 037import java.util.Iterator; 038import java.util.LinkedHashMap; 039import java.util.List; 040import java.util.Map; 041import java.util.NoSuchElementException; 042import java.util.TreeMap; 043 044/** 045 * Parses CSV files according to the specified format. 046 * 047 * Because CSV appears in many different dialects, the parser supports many formats by allowing the 048 * specification of a {@link CSVFormat}. 049 * 050 * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream. 051 * 052 * <h2>Creating instances</h2> 053 * <p> 054 * There are several static factory methods that can be used to create instances for various types of resources: 055 * </p> 056 * <ul> 057 * <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li> 058 * <li>{@link #parse(String, CSVFormat)}</li> 059 * <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li> 060 * </ul> 061 * <p> 062 * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor. 063 * 064 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut: 065 * </p> 066 * <pre> 067 * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) { 068 * ... 069 * } 070 * </pre> 071 * 072 * <h2>Parsing record wise</h2> 073 * <p> 074 * To parse a CSV input from a file, you write: 075 * </p> 076 * 077 * <pre> 078 * File csvData = new File("/path/to/csv"); 079 * CSVParser parser = CSVParser.parse(csvData, CSVFormat.RFC4180); 080 * for (CSVRecord csvRecord : parser) { 081 * ... 082 * } 083 * </pre> 084 * 085 * <p> 086 * This will read the parse the contents of the file using the 087 * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format. 088 * </p> 089 * 090 * <p> 091 * To parse CSV input in a format like Excel, you write: 092 * </p> 093 * 094 * <pre> 095 * CSVParser parser = CSVParser.parse(csvData, CSVFormat.EXCEL); 096 * for (CSVRecord csvRecord : parser) { 097 * ... 098 * } 099 * </pre> 100 * 101 * <p> 102 * If the predefined formats don't match the format at hands, custom formats can be defined. More information about 103 * customising CSVFormats is available in {@link CSVFormat CSVFormat Javadoc}. 104 * </p> 105 * 106 * <h2>Parsing into memory</h2> 107 * <p> 108 * If parsing record wise is not desired, the contents of the input can be read completely into memory. 109 * </p> 110 * 111 * <pre> 112 * Reader in = new StringReader("a;b\nc;d"); 113 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL); 114 * List<CSVRecord> list = parser.getRecords(); 115 * </pre> 116 * 117 * <p> 118 * There are two constraints that have to be kept in mind: 119 * </p> 120 * 121 * <ol> 122 * <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from 123 * the input, those records will not end up in the in memory representation of your CSV data.</li> 124 * <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're 125 * parsing a 150MB file of CSV data the contents will be read completely into memory.</li> 126 * </ol> 127 * 128 * <h2>Notes</h2> 129 * <p> 130 * Internal parser state is completely covered by the format and the reader-state. 131 * </p> 132 * 133 * @see <a href="package-summary.html">package documentation for more details</a> 134 */ 135public final class CSVParser implements Iterable<CSVRecord>, Closeable { 136 137 class CSVRecordIterator implements Iterator<CSVRecord> { 138 private CSVRecord current; 139 140 private CSVRecord getNextRecord() { 141 try { 142 return CSVParser.this.nextRecord(); 143 } catch (final IOException e) { 144 throw new IllegalStateException( 145 e.getClass().getSimpleName() + " reading next record: " + e.toString(), e); 146 } 147 } 148 149 @Override 150 public boolean hasNext() { 151 if (CSVParser.this.isClosed()) { 152 return false; 153 } 154 if (this.current == null) { 155 this.current = this.getNextRecord(); 156 } 157 158 return this.current != null; 159 } 160 161 @Override 162 public CSVRecord next() { 163 if (CSVParser.this.isClosed()) { 164 throw new NoSuchElementException("CSVParser has been closed"); 165 } 166 CSVRecord next = this.current; 167 this.current = null; 168 169 if (next == null) { 170 // hasNext() wasn't called before 171 next = this.getNextRecord(); 172 if (next == null) { 173 throw new NoSuchElementException("No more CSV records available"); 174 } 175 } 176 177 return next; 178 } 179 180 @Override 181 public void remove() { 182 throw new UnsupportedOperationException(); 183 } 184 } 185 186 /** 187 * Creates a parser for the given {@link File}. 188 * 189 * @param file 190 * a CSV file. Must not be null. 191 * @param charset 192 * A Charset 193 * @param format 194 * the CSVFormat used for CSV parsing. Must not be null. 195 * @return a new parser 196 * @throws IllegalArgumentException 197 * If the parameters of the format are inconsistent or if either file or format are null. 198 * @throws IOException 199 * If an I/O error occurs 200 */ 201 @SuppressWarnings("resource") 202 public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException { 203 Assertions.notNull(file, "file"); 204 Assertions.notNull(format, "format"); 205 return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format); 206 } 207 208 /** 209 * Creates a CSV parser using the given {@link CSVFormat}. 210 * 211 * <p> 212 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, 213 * unless you close the {@code reader}. 214 * </p> 215 * 216 * @param inputStream 217 * an InputStream containing CSV-formatted input. Must not be null. 218 * @param charset 219 * a Charset. 220 * @param format 221 * the CSVFormat used for CSV parsing. Must not be null. 222 * @return a new CSVParser configured with the given reader and format. 223 * @throws IllegalArgumentException 224 * If the parameters of the format are inconsistent or if either reader or format are null. 225 * @throws IOException 226 * If there is a problem reading the header or skipping the first record 227 * @since 1.5 228 */ 229 @SuppressWarnings("resource") 230 public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format) 231 throws IOException { 232 Assertions.notNull(inputStream, "inputStream"); 233 Assertions.notNull(format, "format"); 234 return parse(new InputStreamReader(inputStream, charset), format); 235 } 236 237 /** 238 * Creates a parser for the given {@link Path}. 239 * 240 * @param path 241 * a CSV file. Must not be null. 242 * @param charset 243 * A Charset 244 * @param format 245 * the CSVFormat used for CSV parsing. Must not be null. 246 * @return a new parser 247 * @throws IllegalArgumentException 248 * If the parameters of the format are inconsistent or if either file or format are null. 249 * @throws IOException 250 * If an I/O error occurs 251 * @since 1.5 252 */ 253 public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException { 254 Assertions.notNull(path, "path"); 255 Assertions.notNull(format, "format"); 256 return parse(Files.newInputStream(path), charset, format); 257 } 258 259 /** 260 * Creates a CSV parser using the given {@link CSVFormat} 261 * 262 * <p> 263 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, 264 * unless you close the {@code reader}. 265 * </p> 266 * 267 * @param reader 268 * a Reader containing CSV-formatted input. Must not be null. 269 * @param format 270 * the CSVFormat used for CSV parsing. Must not be null. 271 * @return a new CSVParser configured with the given reader and format. 272 * @throws IllegalArgumentException 273 * If the parameters of the format are inconsistent or if either reader or format are null. 274 * @throws IOException 275 * If there is a problem reading the header or skipping the first record 276 * @since 1.5 277 */ 278 public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException { 279 return new CSVParser(reader, format); 280 } 281 282 /** 283 * Creates a parser for the given {@link String}. 284 * 285 * @param string 286 * a CSV string. Must not be null. 287 * @param format 288 * the CSVFormat used for CSV parsing. Must not be null. 289 * @return a new parser 290 * @throws IllegalArgumentException 291 * If the parameters of the format are inconsistent or if either string or format are null. 292 * @throws IOException 293 * If an I/O error occurs 294 */ 295 public static CSVParser parse(final String string, final CSVFormat format) throws IOException { 296 Assertions.notNull(string, "string"); 297 Assertions.notNull(format, "format"); 298 299 return new CSVParser(new StringReader(string), format); 300 } 301 302 // the following objects are shared to reduce garbage 303 304 /** 305 * Creates a parser for the given URL. 306 * 307 * <p> 308 * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless 309 * you close the {@code url}. 310 * </p> 311 * 312 * @param url 313 * a URL. Must not be null. 314 * @param charset 315 * the charset for the resource. Must not be null. 316 * @param format 317 * the CSVFormat used for CSV parsing. Must not be null. 318 * @return a new parser 319 * @throws IllegalArgumentException 320 * If the parameters of the format are inconsistent or if either url, charset or format are null. 321 * @throws IOException 322 * If an I/O error occurs 323 */ 324 public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException { 325 Assertions.notNull(url, "url"); 326 Assertions.notNull(charset, "charset"); 327 Assertions.notNull(format, "format"); 328 329 return new CSVParser(new InputStreamReader(url.openStream(), charset), format); 330 } 331 332 private final CSVFormat format; 333 334 /** A mapping of column names to column indices */ 335 private final Map<String, Integer> headerMap; 336 337 /** The column order to avoid re-computing it. */ 338 private final List<String> headerNames; 339 340 private final Lexer lexer; 341 342 private final CSVRecordIterator csvRecordIterator; 343 344 /** A record buffer for getRecord(). Grows as necessary and is reused. */ 345 private final List<String> recordList = new ArrayList<>(); 346 347 /** 348 * The next record number to assign. 349 */ 350 private long recordNumber; 351 352 /** 353 * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination 354 * with {@link #recordNumber}. 355 */ 356 private final long characterOffset; 357 358 private final Token reusableToken = new Token(); 359 360 /** 361 * Customized CSV parser using the given {@link CSVFormat} 362 * 363 * <p> 364 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, 365 * unless you close the {@code reader}. 366 * </p> 367 * 368 * @param reader 369 * a Reader containing CSV-formatted input. Must not be null. 370 * @param format 371 * the CSVFormat used for CSV parsing. Must not be null. 372 * @throws IllegalArgumentException 373 * If the parameters of the format are inconsistent or if either reader or format are null. 374 * @throws IOException 375 * If there is a problem reading the header or skipping the first record 376 */ 377 public CSVParser(final Reader reader, final CSVFormat format) throws IOException { 378 this(reader, format, 0, 1); 379 } 380 381 /** 382 * Customized CSV parser using the given {@link CSVFormat} 383 * 384 * <p> 385 * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, 386 * unless you close the {@code reader}. 387 * </p> 388 * 389 * @param reader 390 * a Reader containing CSV-formatted input. Must not be null. 391 * @param format 392 * the CSVFormat used for CSV parsing. Must not be null. 393 * @param characterOffset 394 * Lexer offset when the parser does not start parsing at the beginning of the source. 395 * @param recordNumber 396 * The next record number to assign 397 * @throws IllegalArgumentException 398 * If the parameters of the format are inconsistent or if either reader or format are null. 399 * @throws IOException 400 * If there is a problem reading the header or skipping the first record 401 * @since 1.1 402 */ 403 @SuppressWarnings("resource") 404 public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) 405 throws IOException { 406 Assertions.notNull(reader, "reader"); 407 Assertions.notNull(format, "format"); 408 409 this.format = format; 410 this.lexer = new Lexer(format, new ExtendedBufferedReader(reader)); 411 this.csvRecordIterator = new CSVRecordIterator(); 412 final Headers headers = createHeaders(); 413 this.headerMap = headers.headerMap; 414 this.headerNames = headers.headerNames; 415 this.characterOffset = characterOffset; 416 this.recordNumber = recordNumber - 1; 417 } 418 419 private void addRecordValue(final boolean lastRecord) { 420 final String input = this.reusableToken.content.toString(); 421 final String inputClean = this.format.getTrim() ? input.trim() : input; 422 if (lastRecord && inputClean.isEmpty() && this.format.getTrailingDelimiter()) { 423 return; 424 } 425 final String nullString = this.format.getNullString(); 426 this.recordList.add(inputClean.equals(nullString) ? null : inputClean); 427 } 428 429 /** 430 * Closes resources. 431 * 432 * @throws IOException 433 * If an I/O error occurs 434 */ 435 @Override 436 public void close() throws IOException { 437 if (this.lexer != null) { 438 this.lexer.close(); 439 } 440 } 441 442 private Map<String, Integer> createEmptyHeaderMap() { 443 return this.format.getIgnoreHeaderCase() ? 444 new TreeMap<>(String.CASE_INSENSITIVE_ORDER) : 445 new LinkedHashMap<>(); 446 } 447 448 /** 449 * Header information based on name and position. 450 */ 451 private static final class Headers { 452 /** 453 * Header column positions (0-based) 454 */ 455 final Map<String, Integer> headerMap; 456 457 /** 458 * Header names in column order 459 */ 460 final List<String> headerNames; 461 462 Headers(final Map<String, Integer> headerMap, final List<String> headerNames) { 463 this.headerMap = headerMap; 464 this.headerNames = headerNames; 465 } 466 } 467 468 /** 469 * Creates the name to index mapping if the format defines a header. 470 * 471 * @return null if the format has no header. 472 * @throws IOException if there is a problem reading the header or skipping the first record 473 */ 474 private Headers createHeaders() throws IOException { 475 Map<String, Integer> hdrMap = null; 476 List<String> headerNames = null; 477 final String[] formatHeader = this.format.getHeader(); 478 if (formatHeader != null) { 479 hdrMap = createEmptyHeaderMap(); 480 String[] headerRecord = null; 481 if (formatHeader.length == 0) { 482 // read the header from the first line of the file 483 final CSVRecord nextRecord = this.nextRecord(); 484 if (nextRecord != null) { 485 headerRecord = nextRecord.values(); 486 } 487 } else { 488 if (this.format.getSkipHeaderRecord()) { 489 this.nextRecord(); 490 } 491 headerRecord = formatHeader; 492 } 493 494 // build the name to index mappings 495 if (headerRecord != null) { 496 for (int i = 0; i < headerRecord.length; i++) { 497 final String header = headerRecord[i]; 498 final boolean containsHeader = header == null ? false : hdrMap.containsKey(header); 499 final boolean emptyHeader = header == null || header.trim().isEmpty(); 500 if (containsHeader) { 501 if (!emptyHeader && !this.format.getAllowDuplicateHeaderNames()) { 502 throw new IllegalArgumentException( 503 String.format( 504 "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.withAllowDuplicateHeaderNames().", 505 header, Arrays.toString(headerRecord))); 506 } 507 if (emptyHeader && !this.format.getAllowMissingColumnNames()) { 508 throw new IllegalArgumentException( 509 "A header name is missing in " + Arrays.toString(headerRecord)); 510 } 511 } 512 if (header != null) { 513 hdrMap.put(header, Integer.valueOf(i)); 514 if (headerNames == null) { 515 headerNames = new ArrayList<>(headerRecord.length); 516 } 517 headerNames.add(header); 518 } 519 } 520 } 521 } 522 if (headerNames == null) { 523 headerNames = Collections.emptyList(); //immutable 524 } else { 525 headerNames = Collections.unmodifiableList(headerNames); 526 } 527 return new Headers(hdrMap, headerNames); 528 } 529 530 /** 531 * Returns the current line number in the input stream. 532 * 533 * <p> 534 * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to 535 * the record number. 536 * </p> 537 * 538 * @return current line number 539 */ 540 public long getCurrentLineNumber() { 541 return this.lexer.getCurrentLineNumber(); 542 } 543 544 /** 545 * Gets the first end-of-line string encountered. 546 * 547 * @return the first end-of-line string 548 * @since 1.5 549 */ 550 public String getFirstEndOfLine() { 551 return lexer.getFirstEol(); 552 } 553 554 /** 555 * Returns a copy of the header map. 556 * <p> 557 * The map keys are column names. The map values are 0-based indices. 558 * </p> 559 * @return a copy of the header map. 560 */ 561 public Map<String, Integer> getHeaderMap() { 562 if (this.headerMap == null) { 563 return null; 564 } 565 final Map<String, Integer> map = createEmptyHeaderMap(); 566 map.putAll(this.headerMap); 567 return map; 568 } 569 570 /** 571 * Returns the header map. 572 * 573 * @return the header map. 574 */ 575 Map<String, Integer> getHeaderMapRaw() { 576 return this.headerMap; 577 } 578 579 /** 580 * Returns a read-only list of header names that iterates in column order. 581 * 582 * @return read-only list of header names that iterates in column order. 583 * @since 1.7 584 */ 585 public List<String> getHeaderNames() { 586 return headerNames; 587 } 588 589 /** 590 * Returns the current record number in the input stream. 591 * 592 * <p> 593 * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to 594 * the line number. 595 * </p> 596 * 597 * @return current record number 598 */ 599 public long getRecordNumber() { 600 return this.recordNumber; 601 } 602 603 /** 604 * Parses the CSV input according to the given format and returns the content as a list of 605 * {@link CSVRecord CSVRecords}. 606 * 607 * <p> 608 * The returned content starts at the current parse-position in the stream. 609 * </p> 610 * 611 * @return list of {@link CSVRecord CSVRecords}, may be empty 612 * @throws IOException 613 * on parse error or input read-failure 614 */ 615 public List<CSVRecord> getRecords() throws IOException { 616 CSVRecord rec; 617 final List<CSVRecord> records = new ArrayList<>(); 618 while ((rec = this.nextRecord()) != null) { 619 records.add(rec); 620 } 621 return records; 622 } 623 624 /** 625 * Gets whether this parser is closed. 626 * 627 * @return whether this parser is closed. 628 */ 629 public boolean isClosed() { 630 return this.lexer.isClosed(); 631 } 632 633 /** 634 * Returns an iterator on the records. 635 * 636 * <p> 637 * An {@link IOException} caught during the iteration are re-thrown as an 638 * {@link IllegalStateException}. 639 * </p> 640 * <p> 641 * If the parser is closed a call to {@link Iterator#next()} will throw a 642 * {@link NoSuchElementException}. 643 * </p> 644 */ 645 @Override 646 public Iterator<CSVRecord> iterator() { 647 return csvRecordIterator; 648 } 649 650 /** 651 * Parses the next record from the current point in the stream. 652 * 653 * @return the record as an array of values, or {@code null} if the end of the stream has been reached 654 * @throws IOException 655 * on parse error or input read-failure 656 */ 657 CSVRecord nextRecord() throws IOException { 658 CSVRecord result = null; 659 this.recordList.clear(); 660 StringBuilder sb = null; 661 final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset; 662 do { 663 this.reusableToken.reset(); 664 this.lexer.nextToken(this.reusableToken); 665 switch (this.reusableToken.type) { 666 case TOKEN: 667 this.addRecordValue(false); 668 break; 669 case EORECORD: 670 this.addRecordValue(true); 671 break; 672 case EOF: 673 if (this.reusableToken.isReady) { 674 this.addRecordValue(true); 675 } 676 break; 677 case INVALID: 678 throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence"); 679 case COMMENT: // Ignored currently 680 if (sb == null) { // first comment for this record 681 sb = new StringBuilder(); 682 } else { 683 sb.append(Constants.LF); 684 } 685 sb.append(this.reusableToken.content); 686 this.reusableToken.type = TOKEN; // Read another token 687 break; 688 default: 689 throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type); 690 } 691 } while (this.reusableToken.type == TOKEN); 692 693 if (!this.recordList.isEmpty()) { 694 this.recordNumber++; 695 final String comment = sb == null ? null : sb.toString(); 696 result = new CSVRecord(this, this.recordList.toArray(new String[this.recordList.size()]), 697 comment, this.recordNumber, startCharPosition); 698 } 699 return result; 700 } 701 702}