/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.text.matcher.StringMatcher;
import org.apache.commons.text.matcher.StringMatcherFactory;

/**
 * Tokenizes a string based on delimiters (separators), supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims to do a similar job to
 * {@link java.util.StringTokenizer StringTokenizer}, however, it offers much more control and flexibility including
 * implementing the <code>ListIterator</code> interface. By default, it is set up like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>. Each token is separated from the next by a
 * <i>delimiter</i>. One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes. The <i>quote</i> matcher specifies the quote character(s). A quote may be
 * escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming. The <i>trimmer</i> matcher
 * specifies these characters. One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters. The <i>ignored</i> matcher specifies
 * these characters to be removed. One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
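 * <p>
 * As a usage sketch (the exact tokens depend on the configured matchers; the comments show the expected output for
 * the defaults described here):
 *
 * <pre>
 * StringTokenizer tok = new StringTokenizer("one two three");
 * while (tok.hasNext()) {
 *     System.out.println(tok.next()); // prints "one", "two", "three"
 * }
 *
 * StringTokenizer csv = StringTokenizer.getCSVInstance("a, \"b,c\", d");
 * System.out.println(csv.getTokenList()); // prints [a, b,c, d]
 * </pre>
 * <p>
 * The following examples illustrate how various inputs are split: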
 *
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table summary="Tokenizer Properties">
 * <tr>
 * <th>Property</th>
 * <th>Type</th>
 * <th>Default</th>
 * </tr>
 * <tr>
 * <td>delim</td>
 * <td>CharSetMatcher</td>
 * <td>{ \t\n\r\f}</td>
 * </tr>
 * <tr>
 * <td>quote</td>
 * <td>NoneMatcher</td>
 * <td>{}</td>
 * </tr>
 * <tr>
 * <td>ignore</td>
 * <td>NoneMatcher</td>
 * <td>{}</td>
 * </tr>
 * <tr>
 * <td>emptyTokenAsNull</td>
 * <td>boolean</td>
 * <td>false</td>
 * </tr>
 * <tr>
 * <td>ignoreEmptyTokens</td>
 * <td>boolean</td>
 * <td>true</td>
 * </tr>
 * </table>
 *
 * @since 1.3
 */
public class StringTokenizer implements ListIterator<String>, Cloneable {

    /** Comma separated values tokenizer internal variable. */
    private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE;
    /** Tab separated values tokenizer internal variable. */
    private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens. */
    private String[] tokens;
    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();
    /** The quote matcher. */
    private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
    /** The ignored matcher. */
    private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
    /** The trimmer matcher. */
    private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    // -----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StringTokenizer getCSVClone() {
        return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings. The default for CSV processing will
     * be to trim whitespace from both ends (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings, initializing it with the given input.
     * The default for CSV processing will be to trim whitespace from both ends (which can be overridden with the
     * setTrimmer method).
     *
     * @param input
     *            the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance(final String input) {
        final StringTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings, initializing it with the given input.
     * The default for CSV processing will be to trim whitespace from both ends (which can be overridden with the
     * setTrimmer method).
     *
     * @param input
     *            the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance(final char[] input) {
        final StringTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StringTokenizer getTSVClone() {
        return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing will be
     * to trim whitespace from both ends (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing will be
     * to trim whitespace from both ends (which can be overridden with the setTrimmer method).
     *
     * @param input
     *            the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final String input) {
        final StringTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing will be
     * to trim whitespace from both ends (which can be overridden with the setTrimmer method).
     *
     * @param input
     *            the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final char[] input) {
        final StringTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    // -----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text
     * to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StringTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
     *
     * @param input
     *            the string which is to be parsed
     */
    public StringTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter character
     */
    public StringTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter string
     */
    public StringTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter matcher
     */
    public StringTokenizer(final String input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter character
     * @param quote
     *            the field quoted string character
     */
    public StringTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter matcher
     * @param quote
     *            the field quoted string matcher
     */
    public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
     *
     * @param input
     *            the string which is to be parsed, cloned internally
     */
    public StringTokenizer(final char[] input) {
        super();
        if (input == null) {
            this.chars = null;
        } else {
            this.chars = input.clone();
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
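     * <p>
     * For example, {@code new StringTokenizer("a:b:c".toCharArray(), ':')} is a usage sketch that yields the tokens
     * {@code "a"}, {@code "b"} and {@code "c"}, equivalent to the String-based constructor above.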
     *
     * @param input
     *            the string which is to be parsed, cloned internally
     * @param delim
     *            the field delimiter character
     */
    public StringTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input
     *            the string which is to be parsed, cloned internally
     * @param delim
     *            the field delimiter string
     */
    public StringTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input
     *            the string which is to be parsed, cloned internally
     * @param delim
     *            the field delimiter matcher
     */
    public StringTokenizer(final char[] input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input
     *            the string which is to be parsed, cloned internally
     * @param delim
     *            the field delimiter character
     * @param quote
     *            the field quoted string character
     */
    public StringTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input
     *            the string which is to be parsed, cloned internally
     * @param delim
     *            the field delimiter matcher
     * @param quote
     *            the field quoted string matcher
     */
    public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    // -----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing
     * {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
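     * <p>
     * For example (a sketch using the default whitespace delimiters):
     *
     * <pre>
     * List&lt;String&gt; tokens = new StringTokenizer("a b c").getTokenList(); // [a, b, c]
     * </pre>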
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        Collections.addAll(list, tokens);

        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StringTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with
     * the same settings on multiple input lines.
     *
     * @param input
     *            the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StringTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with
     * the same settings on multiple input lines.
     *
     * @param input
     *            the new character array to tokenize, cloned internally, null sets no text to parse
     * @return this, to enable chaining
     */
    public StringTokenizer reset(final char[] input) {
        reset();
        if (input != null) {
            this.chars = input.clone();
        } else {
            this.chars = null;
        }
        return this;
    }

    // ListIterator
    // -----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException
     *             if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj
     *            this parameter is ignored.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj
     *            this parameter is ignored.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    // -----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method that performs the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method will be called automatically by other
     * (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass
     * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple
     * strings. It is also possible to filter the results.
     * <p>
     * <code>StringTokenizer</code> will always pass a zero offset and a count equal to the length of the array to
     * this method, however a subclass may pass other values, or even an entirely different array.
     *
     * @param srcChars
     *            the character array being tokenized, may be null
     * @param offset
     *            the start position within the character array, must be valid
     * @param count
     *            the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final TextStringBuilder buf = new TextStringBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, "");
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list
     *            the list to add to
     * @param tok
     *            the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
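     * <p>
     * As an illustrative trace (not part of the public contract): for the input {@code "a;b"} with a semicolon
     * delimiter and a start position of 0, the token {@code "a"} is added to the list and 2 is returned, which is
     * the index of {@code 'b'}.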
     *
     * @param srcChars
     *            the character array being tokenized
     * @param start
     *            the first character of field
     * @param len
     *            the length of the character array being tokenized
     * @param workArea
     *            a temporary work area
     * @param tokenList
     *            the list of parsed tokens
     * @return the starting position of the next field (the character immediately after the delimiter), or -1 if end
     *         of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, "");
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, "");
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars
     *            the character array being tokenized
     * @param start
     *            the first character of field
     * @param len
     *            the length of the character array being tokenized
     * @param workArea
     *            a temporary work area
     * @param tokenList
     *            the list of parsed tokens
     * @param quoteStart
     *            the start position of the matched quote, 0 if no quoting
     * @param quoteLen
     *            the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character immediately after the delimiter), or -1 if the
     *         end of string is found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
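                // For example, with '"' as the quote matcher, the raw input "a""b"
                // (quote, a, quote, quote, b, quote) produces the single token a"b.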
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote already matched in readNextToken().
     *
     * @param srcChars
     *            the character array being tokenized
     * @param pos
     *            the position to check for a quote
     * @param len
     *            the length of the character array being tokenized
     * @param quoteStart
     *            the start position of the matched quote, 0 if no quoting
     * @param quoteLen
     *            the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart,
            final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    // -----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StringMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim
     *            the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
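     * <p>
     * For example, {@code new StringTokenizer("a;b;c").setDelimiterChar(';')} configures the tokenizer to produce
     * the tokens {@code "a"}, {@code "b"} and {@code "c"}.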
     *
     * @param delim
     *            the delimiter character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim
     *            the delimiter string to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
    }

    // Quote
    // -----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
     * The default is not to use quoting (the none matcher).
     *
     * @return the quote matcher in use
     */
    public StringMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
     *
     * @param quote
     *            the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
     *
     * @param quote
     *            the quote character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
    }

    // Ignored
    // -----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are within a quoted region. The default value
     * is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StringMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are within a quoted region.
     *
     * @param ignored
     *            the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is within a quoted region.
     *
     * @param ignored
     *            the ignored character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
    }

    // Trimmer
    // -----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default
     * value is not to trim anything.
     *
     * @return the trimmer matcher in use
     */
    public StringMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter until the token or quote is found.
     *
     * @param trimmer
     *            the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    // -----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null. The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
     *
     * @param emptyAsNull
     *            whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    // -----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens. The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
     *
     * @param ignoreEmptyTokens
     *            whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    // -----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    // -----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the
     * token list. If a {@link CloneNotSupportedException} is caught, <code>null</code> is returned.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the
     * token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException
     *             if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StringTokenizer cloned = (StringTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    // -----------------------------------------------------------------------
    /**
     * Returns a String representation of the tokenizer, including the parsed tokens once tokenization has been
     * performed.
     *
     * @return the string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StringTokenizer[not tokenized yet]";
        }
        return "StringTokenizer" + getTokenList();
    }

}