001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text;
018
019import java.util.ArrayList;
020import java.util.Collections;
021import java.util.List;
022import java.util.ListIterator;
023import java.util.NoSuchElementException;
024
025import org.apache.commons.text.matcher.StringMatcher;
026import org.apache.commons.text.matcher.StringMatcherFactory;
027
028/**
029 * Tokenizes a string based on delimiters (separators) and supporting quoting and ignored character concepts.
030 * <p>
031 * This class can split a String into many smaller strings. It aims to do a similar job to
032 * {@link java.util.StringTokenizer StringTokenizer}, however it offers much more control and flexibility including
033 * implementing the <code>ListIterator</code> interface. By default, it is set up like <code>StringTokenizer</code>.
034 * <p>
035 * The input String is split into a number of <i>tokens</i>. Each token is separated from the next String by a
036 * <i>delimiter</i>. One or more delimiter characters must be specified.
037 * <p>
038 * Each token may be surrounded by quotes. The <i>quote</i> matcher specifies the quote character(s). A quote may be
039 * escaped within a quoted section by duplicating itself.
040 * <p>
041 * Between each token and the delimiter are potentially characters that need trimming. The <i>trimmer</i> matcher
042 * specifies these characters. One usage might be to trim whitespace characters.
043 * <p>
044 * At any point outside the quotes there might potentially be invalid characters. The <i>ignored</i> matcher specifies
045 * these characters to be removed. One usage might be to remove new line characters.
046 * <p>
047 * Empty tokens may be removed or returned as null.
048 *
049 * <pre>
050 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
051 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
052 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
053 * </pre>
054 * <p>
055 *
056 * This tokenizer has the following properties and options:
057 *
058 * <table summary="Tokenizer Properties">
059 * <tr>
060 * <th>Property</th>
061 * <th>Type</th>
062 * <th>Default</th>
063 * </tr>
064 * <tr>
065 * <td>delim</td>
066 * <td>CharSetMatcher</td>
067 * <td>{ \t\n\r\f}</td>
068 * </tr>
069 * <tr>
070 * <td>quote</td>
071 * <td>NoneMatcher</td>
072 * <td>{}</td>
073 * </tr>
074 * <tr>
075 * <td>ignore</td>
076 * <td>NoneMatcher</td>
077 * <td>{}</td>
078 * </tr>
079 * <tr>
080 * <td>emptyTokenAsNull</td>
081 * <td>boolean</td>
082 * <td>false</td>
083 * </tr>
084 * <tr>
085 * <td>ignoreEmptyTokens</td>
086 * <td>boolean</td>
087 * <td>true</td>
088 * </tr>
089 * </table>
090 *
091 * @since 1.3
092 */
093public class StringTokenizer implements ListIterator<String>, Cloneable {
094
095    /** Comma separated values tokenizer internal variable. */
096    private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE;
097    /** Tab separated values tokenizer internal variable. */
098    private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE;
099    static {
100        CSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
101        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher());
102        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
103        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
104        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
105        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
106        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
107
108        TSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
109        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher());
110        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
111        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
112        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
113        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
114        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
115    }
116
117    /** The text to work on. */
118    private char[] chars;
119    /** The parsed tokens. */
120    private String[] tokens;
121    /** The current iteration position. */
122    private int tokenPos;
123
124    /** The delimiter matcher. */
125    private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();
126    /** The quote matcher. */
127    private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
128    /** The ignored matcher. */
129    private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
130    /** The trimmer matcher. */
131    private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
132
133    /** Whether to return empty tokens as null. */
134    private boolean emptyAsNull = false;
135    /** Whether to ignore empty tokens. */
136    private boolean ignoreEmptyTokens = true;
137
138    // -----------------------------------------------------------------------
139
140    /**
141     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
142     *
143     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
144     */
145    private static StringTokenizer getCSVClone() {
146        return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
147    }
148
149    /**
150     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
151     * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
152     * setTrimmer method).
153     * <p>
154     * You must call a "reset" method to set the string which you want to parse.
155     *
156     * @return a new tokenizer instance which parses Comma Separated Value strings
157     */
158    public static StringTokenizer getCSVInstance() {
159        return getCSVClone();
160    }
161
162    /**
163     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
164     * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
165     * setTrimmer method).
166     *
167     * @param input
168     *            the text to parse
169     * @return a new tokenizer instance which parses Comma Separated Value strings
170     */
171    public static StringTokenizer getCSVInstance(final String input) {
172        final StringTokenizer tok = getCSVClone();
173        tok.reset(input);
174        return tok;
175    }
176
177    /**
178     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
179     * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
180     * setTrimmer method).
181     *
182     * @param input
183     *            the text to parse
184     * @return a new tokenizer instance which parses Comma Separated Value strings
185     */
186    public static StringTokenizer getCSVInstance(final char[] input) {
187        final StringTokenizer tok = getCSVClone();
188        tok.reset(input);
189        return tok;
190    }
191
192    /**
193     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
194     *
195     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
196     */
197    private static StringTokenizer getTSVClone() {
198        return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
199    }
200
201    /**
202     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
203     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
204     * <p>
205     * You must call a "reset" method to set the string which you want to parse.
206     *
207     * @return a new tokenizer instance which parses Tab Separated Value strings.
208     */
209    public static StringTokenizer getTSVInstance() {
210        return getTSVClone();
211    }
212
213    /**
214     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
215     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
216     *
217     * @param input
218     *            the string to parse
219     * @return a new tokenizer instance which parses Tab Separated Value strings.
220     */
221    public static StringTokenizer getTSVInstance(final String input) {
222        final StringTokenizer tok = getTSVClone();
223        tok.reset(input);
224        return tok;
225    }
226
227    /**
228     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
229     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
230     *
231     * @param input
232     *            the string to parse
233     * @return a new tokenizer instance which parses Tab Separated Value strings.
234     */
235    public static StringTokenizer getTSVInstance(final char[] input) {
236        final StringTokenizer tok = getTSVClone();
237        tok.reset(input);
238        return tok;
239    }
240
241    // -----------------------------------------------------------------------
242    /**
243     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to
244     * tokenize.
245     * <p>
246     * This constructor is normally used with {@link #reset(String)}.
247     */
248    public StringTokenizer() {
249        super();
250        this.chars = null;
251    }
252
253    /**
254     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
255     *
256     * @param input
257     *            the string which is to be parsed
258     */
259    public StringTokenizer(final String input) {
260        super();
261        if (input != null) {
262            chars = input.toCharArray();
263        } else {
264            chars = null;
265        }
266    }
267
268    /**
269     * Constructs a tokenizer splitting on the specified delimiter character.
270     *
271     * @param input
272     *            the string which is to be parsed
273     * @param delim
274     *            the field delimiter character
275     */
276    public StringTokenizer(final String input, final char delim) {
277        this(input);
278        setDelimiterChar(delim);
279    }
280
281    /**
282     * Constructs a tokenizer splitting on the specified delimiter string.
283     *
284     * @param input
285     *            the string which is to be parsed
286     * @param delim
287     *            the field delimiter string
288     */
289    public StringTokenizer(final String input, final String delim) {
290        this(input);
291        setDelimiterString(delim);
292    }
293
294    /**
295     * Constructs a tokenizer splitting using the specified delimiter matcher.
296     *
297     * @param input
298     *            the string which is to be parsed
299     * @param delim
300     *            the field delimiter matcher
301     */
302    public StringTokenizer(final String input, final StringMatcher delim) {
303        this(input);
304        setDelimiterMatcher(delim);
305    }
306
307    /**
308     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
309     * quote character.
310     *
311     * @param input
312     *            the string which is to be parsed
313     * @param delim
314     *            the field delimiter character
315     * @param quote
316     *            the field quoted string character
317     */
318    public StringTokenizer(final String input, final char delim, final char quote) {
319        this(input, delim);
320        setQuoteChar(quote);
321    }
322
323    /**
324     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
325     * quote matcher.
326     *
327     * @param input
328     *            the string which is to be parsed
329     * @param delim
330     *            the field delimiter matcher
331     * @param quote
332     *            the field quoted string matcher
333     */
334    public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
335        this(input, delim);
336        setQuoteMatcher(quote);
337    }
338
339    /**
340     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
341     *
342     * @param input
343     *            the string which is to be parsed, not cloned
344     */
345    public StringTokenizer(final char[] input) {
346        super();
347        if (input == null) {
348            this.chars = null;
349        } else {
350            this.chars = input.clone();
351        }
352    }
353
354    /**
355     * Constructs a tokenizer splitting on the specified character.
356     *
357     * @param input
358     *            the string which is to be parsed, not cloned
359     * @param delim
360     *            the field delimiter character
361     */
362    public StringTokenizer(final char[] input, final char delim) {
363        this(input);
364        setDelimiterChar(delim);
365    }
366
367    /**
368     * Constructs a tokenizer splitting on the specified string.
369     *
370     * @param input
371     *            the string which is to be parsed, not cloned
372     * @param delim
373     *            the field delimiter string
374     */
375    public StringTokenizer(final char[] input, final String delim) {
376        this(input);
377        setDelimiterString(delim);
378    }
379
380    /**
381     * Constructs a tokenizer splitting using the specified delimiter matcher.
382     *
383     * @param input
384     *            the string which is to be parsed, not cloned
385     * @param delim
386     *            the field delimiter matcher
387     */
388    public StringTokenizer(final char[] input, final StringMatcher delim) {
389        this(input);
390        setDelimiterMatcher(delim);
391    }
392
393    /**
394     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
395     * quote character.
396     *
397     * @param input
398     *            the string which is to be parsed, not cloned
399     * @param delim
400     *            the field delimiter character
401     * @param quote
402     *            the field quoted string character
403     */
404    public StringTokenizer(final char[] input, final char delim, final char quote) {
405        this(input, delim);
406        setQuoteChar(quote);
407    }
408
409    /**
410     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
411     * quote matcher.
412     *
413     * @param input
414     *            the string which is to be parsed, not cloned
415     * @param delim
416     *            the field delimiter character
417     * @param quote
418     *            the field quoted string character
419     */
420    public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
421        this(input, delim);
422        setQuoteMatcher(quote);
423    }
424
425    // API
426    // -----------------------------------------------------------------------
427    /**
428     * Gets the number of tokens found in the String.
429     *
430     * @return the number of matched tokens
431     */
432    public int size() {
433        checkTokenized();
434        return tokens.length;
435    }
436
437    /**
438     * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing
439     * {@link NoSuchElementException} when no tokens remain.
440     *
441     * @return the next sequential token, or null when no more tokens are found
442     */
443    public String nextToken() {
444        if (hasNext()) {
445            return tokens[tokenPos++];
446        }
447        return null;
448    }
449
450    /**
451     * Gets the previous token from the String.
452     *
453     * @return the previous sequential token, or null when no more tokens are found
454     */
455    public String previousToken() {
456        if (hasPrevious()) {
457            return tokens[--tokenPos];
458        }
459        return null;
460    }
461
462    /**
463     * Gets a copy of the full token list as an independent modifiable array.
464     *
465     * @return the tokens as a String array
466     */
467    public String[] getTokenArray() {
468        checkTokenized();
469        return tokens.clone();
470    }
471
472    /**
473     * Gets a copy of the full token list as an independent modifiable list.
474     *
475     * @return the tokens as a String array
476     */
477    public List<String> getTokenList() {
478        checkTokenized();
479        final List<String> list = new ArrayList<>(tokens.length);
480        Collections.addAll(list, tokens);
481
482        return list;
483    }
484
485    /**
486     * Resets this tokenizer, forgetting all parsing and iteration already completed.
487     * <p>
488     * This method allows the same tokenizer to be reused for the same String.
489     *
490     * @return this, to enable chaining
491     */
492    public StringTokenizer reset() {
493        tokenPos = 0;
494        tokens = null;
495        return this;
496    }
497
498    /**
499     * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
500     * same settings on multiple input lines.
501     *
502     * @param input
503     *            the new string to tokenize, null sets no text to parse
504     * @return this, to enable chaining
505     */
506    public StringTokenizer reset(final String input) {
507        reset();
508        if (input != null) {
509            this.chars = input.toCharArray();
510        } else {
511            this.chars = null;
512        }
513        return this;
514    }
515
516    /**
517     * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
518     * same settings on multiple input lines.
519     *
520     * @param input
521     *            the new character array to tokenize, not cloned, null sets no text to parse
522     * @return this, to enable chaining
523     */
524    public StringTokenizer reset(final char[] input) {
525        reset();
526        if (input != null) {
527            this.chars = input.clone();
528        } else {
529            this.chars = null;
530        }
531        return this;
532    }
533
534    // ListIterator
535    // -----------------------------------------------------------------------
536    /**
537     * Checks whether there are any more tokens.
538     *
539     * @return true if there are more tokens
540     */
541    @Override
542    public boolean hasNext() {
543        checkTokenized();
544        return tokenPos < tokens.length;
545    }
546
547    /**
548     * Gets the next token.
549     *
550     * @return the next String token
551     * @throws NoSuchElementException
552     *             if there are no more elements
553     */
554    @Override
555    public String next() {
556        if (hasNext()) {
557            return tokens[tokenPos++];
558        }
559        throw new NoSuchElementException();
560    }
561
562    /**
563     * Gets the index of the next token to return.
564     *
565     * @return the next token index
566     */
567    @Override
568    public int nextIndex() {
569        return tokenPos;
570    }
571
572    /**
573     * Checks whether there are any previous tokens that can be iterated to.
574     *
575     * @return true if there are previous tokens
576     */
577    @Override
578    public boolean hasPrevious() {
579        checkTokenized();
580        return tokenPos > 0;
581    }
582
583    /**
584     * Gets the token previous to the last returned token.
585     *
586     * @return the previous token
587     */
588    @Override
589    public String previous() {
590        if (hasPrevious()) {
591            return tokens[--tokenPos];
592        }
593        throw new NoSuchElementException();
594    }
595
596    /**
597     * Gets the index of the previous token.
598     *
599     * @return the previous token index
600     */
601    @Override
602    public int previousIndex() {
603        return tokenPos - 1;
604    }
605
606    /**
607     * Unsupported ListIterator operation.
608     *
609     * @throws UnsupportedOperationException
610     *             always
611     */
612    @Override
613    public void remove() {
614        throw new UnsupportedOperationException("remove() is unsupported");
615    }
616
617    /**
618     * Unsupported ListIterator operation.
619     *
620     * @param obj
621     *            this parameter ignored.
622     * @throws UnsupportedOperationException
623     *             always
624     */
625    @Override
626    public void set(final String obj) {
627        throw new UnsupportedOperationException("set() is unsupported");
628    }
629
630    /**
631     * Unsupported ListIterator operation.
632     *
633     * @param obj
634     *            this parameter ignored.
635     * @throws UnsupportedOperationException
636     *             always
637     */
638    @Override
639    public void add(final String obj) {
640        throw new UnsupportedOperationException("add() is unsupported");
641    }
642
643    // Implementation
644    // -----------------------------------------------------------------------
645    /**
646     * Checks if tokenization has been done, and if not then do it.
647     */
648    private void checkTokenized() {
649        if (tokens == null) {
650            if (chars == null) {
651                // still call tokenize as subclass may do some work
652                final List<String> split = tokenize(null, 0, 0);
653                tokens = split.toArray(new String[split.size()]);
654            } else {
655                final List<String> split = tokenize(chars, 0, chars.length);
656                tokens = split.toArray(new String[split.size()]);
657            }
658        }
659    }
660
661    /**
662     * Internal method to performs the tokenization.
663     * <p>
664     * Most users of this class do not need to call this method. This method will be called automatically by other
665     * (public) methods when required.
666     * <p>
667     * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass
668     * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple
669     * strings. It is also be possible to filter the results.
670     * <p>
671     * <code>StrTokenizer</code> will always pass a zero offset and a count equal to the length of the array to this
672     * method, however a subclass may pass other values, or even an entirely different array.
673     *
674     * @param srcChars
675     *            the character array being tokenized, may be null
676     * @param offset
677     *            the start position within the character array, must be valid
678     * @param count
679     *            the number of characters to tokenize, must be valid
680     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
681     */
682    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
683        if (srcChars == null || count == 0) {
684            return Collections.emptyList();
685        }
686        final TextStringBuilder buf = new TextStringBuilder();
687        final List<String> tokenList = new ArrayList<>();
688        int pos = offset;
689
690        // loop around the entire buffer
691        while (pos >= 0 && pos < count) {
692            // find next token
693            pos = readNextToken(srcChars, pos, count, buf, tokenList);
694
695            // handle case where end of string is a delimiter
696            if (pos >= count) {
697                addToken(tokenList, "");
698            }
699        }
700        return tokenList;
701    }
702
703    /**
704     * Adds a token to a list, paying attention to the parameters we've set.
705     *
706     * @param list
707     *            the list to add to
708     * @param tok
709     *            the token to add
710     */
711    private void addToken(final List<String> list, String tok) {
712        if (tok == null || tok.length() == 0) {
713            if (isIgnoreEmptyTokens()) {
714                return;
715            }
716            if (isEmptyTokenAsNull()) {
717                tok = null;
718            }
719        }
720        list.add(tok);
721    }
722
723    /**
724     * Reads character by character through the String to get the next token.
725     *
726     * @param srcChars
727     *            the character array being tokenized
728     * @param start
729     *            the first character of field
730     * @param len
731     *            the length of the character array being tokenized
732     * @param workArea
733     *            a temporary work area
734     * @param tokenList
735     *            the list of parsed tokens
736     * @return the starting position of the next field (the character immediately after the delimiter), or -1 if end of
737     *         string found
738     */
739    private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
740            final List<String> tokenList) {
741        // skip all leading whitespace, unless it is the
742        // field delimiter or the quote character
743        while (start < len) {
744            final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
745                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
746            if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
747                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
748                break;
749            }
750            start += removeLen;
751        }
752
753        // handle reaching end
754        if (start >= len) {
755            addToken(tokenList, "");
756            return -1;
757        }
758
759        // handle empty token
760        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
761        if (delimLen > 0) {
762            addToken(tokenList, "");
763            return start + delimLen;
764        }
765
766        // handle found token
767        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
768        if (quoteLen > 0) {
769            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
770        }
771        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
772    }
773
774    /**
775     * Reads a possibly quoted string token.
776     *
777     * @param srcChars
778     *            the character array being tokenized
779     * @param start
780     *            the first character of field
781     * @param len
782     *            the length of the character array being tokenized
783     * @param workArea
784     *            a temporary work area
785     * @param tokenList
786     *            the list of parsed tokens
787     * @param quoteStart
788     *            the start position of the matched quote, 0 if no quoting
789     * @param quoteLen
790     *            the length of the matched quote, 0 if no quoting
791     * @return the starting position of the next field (the character immediately after the delimiter, or if end of
792     *         string found, then the length of string
793     */
794    private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
795            final List<String> tokenList, final int quoteStart, final int quoteLen) {
796        // Loop until we've found the end of the quoted
797        // string or the end of the input
798        workArea.clear();
799        int pos = start;
800        boolean quoting = quoteLen > 0;
801        int trimStart = 0;
802
803        while (pos < len) {
804            // quoting mode can occur several times throughout a string
805            // we must switch between quoting and non-quoting until we
806            // encounter a non-quoted delimiter, or end of string
807            if (quoting) {
808                // In quoting mode
809
810                // If we've found a quote character, see if it's
811                // followed by a second quote. If so, then we need
812                // to actually put the quote character into the token
813                // rather than end the token.
814                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
815                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
816                        // matched pair of quotes, thus an escaped quote
817                        workArea.append(srcChars, pos, quoteLen);
818                        pos += quoteLen * 2;
819                        trimStart = workArea.size();
820                        continue;
821                    }
822
823                    // end of quoting
824                    quoting = false;
825                    pos += quoteLen;
826                    continue;
827                }
828
829                // copy regular character from inside quotes
830                workArea.append(srcChars[pos++]);
831                trimStart = workArea.size();
832
833            } else {
834                // Not in quoting mode
835
836                // check for delimiter, and thus end of token
837                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
838                if (delimLen > 0) {
839                    // return condition when end of token found
840                    addToken(tokenList, workArea.substring(0, trimStart));
841                    return pos + delimLen;
842                }
843
844                // check for quote, and thus back into quoting mode
845                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
846                    quoting = true;
847                    pos += quoteLen;
848                    continue;
849                }
850
851                // check for ignored (outside quotes), and ignore
852                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
853                if (ignoredLen > 0) {
854                    pos += ignoredLen;
855                    continue;
856                }
857
858                // check for trimmed character
859                // don't yet know if its at the end, so copy to workArea
860                // use trimStart to keep track of trim at the end
861                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
862                if (trimmedLen > 0) {
863                    workArea.append(srcChars, pos, trimmedLen);
864                    pos += trimmedLen;
865                    continue;
866                }
867
868                // copy regular character from outside quotes
869                workArea.append(srcChars[pos++]);
870                trimStart = workArea.size();
871            }
872        }
873
874        // return condition when end of string found
875        addToken(tokenList, workArea.substring(0, trimStart));
876        return -1;
877    }
878
879    /**
880     * Checks if the characters at the index specified match the quote already matched in readNextToken().
881     *
882     * @param srcChars
883     *            the character array being tokenized
884     * @param pos
885     *            the position to check for a quote
886     * @param len
887     *            the length of the character array being tokenized
888     * @param quoteStart
889     *            the start position of the matched quote, 0 if no quoting
890     * @param quoteLen
891     *            the length of the matched quote, 0 if no quoting
892     * @return true if a quote is matched
893     */
894    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart,
895            final int quoteLen) {
896        for (int i = 0; i < quoteLen; i++) {
897            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
898                return false;
899            }
900        }
901        return true;
902    }
903
904    // Delimiter
905    // -----------------------------------------------------------------------
906    /**
907     * Gets the field delimiter matcher.
908     *
909     * @return the delimiter matcher in use
910     */
911    public StringMatcher getDelimiterMatcher() {
912        return this.delimMatcher;
913    }
914
915    /**
916     * Sets the field delimiter matcher.
917     * <p>
918     * The delimiter is used to separate one token from another.
919     *
920     * @param delim
921     *            the delimiter matcher to use
922     * @return this, to enable chaining
923     */
924    public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
925        if (delim == null) {
926            this.delimMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
927        } else {
928            this.delimMatcher = delim;
929        }
930        return this;
931    }
932
933    /**
934     * Sets the field delimiter character.
935     *
936     * @param delim
937     *            the delimiter character to use
938     * @return this, to enable chaining
939     */
940    public StringTokenizer setDelimiterChar(final char delim) {
941        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
942    }
943
944    /**
945     * Sets the field delimiter string.
946     *
947     * @param delim
948     *            the delimiter string to use
949     * @return this, to enable chaining
950     */
951    public StringTokenizer setDelimiterString(final String delim) {
952        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
953    }
954
955    // Quote
956    // -----------------------------------------------------------------------
957    /**
958     * Gets the quote matcher currently in use.
959     * <p>
960     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The
961     * default value is '"' (double quote).
962     *
963     * @return the quote matcher in use
964     */
965    public StringMatcher getQuoteMatcher() {
966        return quoteMatcher;
967    }
968
969    /**
970     * Set the quote matcher to use.
971     * <p>
972     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
973     *
974     * @param quote
975     *            the quote matcher to use, null ignored
976     * @return this, to enable chaining
977     */
978    public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
979        if (quote != null) {
980            this.quoteMatcher = quote;
981        }
982        return this;
983    }
984
985    /**
986     * Sets the quote character to use.
987     * <p>
988     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
989     *
990     * @param quote
991     *            the quote character to use
992     * @return this, to enable chaining
993     */
994    public StringTokenizer setQuoteChar(final char quote) {
995        return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
996    }
997
998    // Ignored
999    // -----------------------------------------------------------------------
1000    /**
1001     * Gets the ignored character matcher.
1002     * <p>
1003     * These characters are ignored when parsing the String, unless they are within a quoted region. The default value
1004     * is not to ignore anything.
1005     *
1006     * @return the ignored matcher in use
1007     */
1008    public StringMatcher getIgnoredMatcher() {
1009        return ignoredMatcher;
1010    }
1011
1012    /**
1013     * Set the matcher for characters to ignore.
1014     * <p>
1015     * These characters are ignored when parsing the String, unless they are within a quoted region.
1016     *
1017     * @param ignored
1018     *            the ignored matcher to use, null ignored
1019     * @return this, to enable chaining
1020     */
1021    public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
1022        if (ignored != null) {
1023            this.ignoredMatcher = ignored;
1024        }
1025        return this;
1026    }
1027
1028    /**
1029     * Set the character to ignore.
1030     * <p>
1031     * This character is ignored when parsing the String, unless it is within a quoted region.
1032     *
1033     * @param ignored
1034     *            the ignored character to use
1035     * @return this, to enable chaining
1036     */
1037    public StringTokenizer setIgnoredChar(final char ignored) {
1038        return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
1039    }
1040
1041    // Trimmer
1042    // -----------------------------------------------------------------------
1043    /**
1044     * Gets the trimmer character matcher.
1045     * <p>
1046     * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default
1047     * value is not to trim anything.
1048     *
1049     * @return the trimmer matcher in use
1050     */
1051    public StringMatcher getTrimmerMatcher() {
1052        return trimmerMatcher;
1053    }
1054
1055    /**
1056     * Sets the matcher for characters to trim.
1057     * <p>
1058     * These characters are trimmed off on each side of the delimiter until the token or quote is found.
1059     *
1060     * @param trimmer
1061     *            the trimmer matcher to use, null ignored
1062     * @return this, to enable chaining
1063     */
1064    public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
1065        if (trimmer != null) {
1066            this.trimmerMatcher = trimmer;
1067        }
1068        return this;
1069    }
1070
1071    // -----------------------------------------------------------------------
1072    /**
1073     * Gets whether the tokenizer currently returns empty tokens as null. The default for this property is false.
1074     *
1075     * @return true if empty tokens are returned as null
1076     */
1077    public boolean isEmptyTokenAsNull() {
1078        return this.emptyAsNull;
1079    }
1080
1081    /**
1082     * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
1083     *
1084     * @param emptyAsNull
1085     *            whether empty tokens are returned as null
1086     * @return this, to enable chaining
1087     */
1088    public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
1089        this.emptyAsNull = emptyAsNull;
1090        return this;
1091    }
1092
1093    // -----------------------------------------------------------------------
1094    /**
1095     * Gets whether the tokenizer currently ignores empty tokens. The default for this property is true.
1096     *
1097     * @return true if empty tokens are not returned
1098     */
1099    public boolean isIgnoreEmptyTokens() {
1100        return ignoreEmptyTokens;
1101    }
1102
1103    /**
1104     * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
1105     *
1106     * @param ignoreEmptyTokens
1107     *            whether empty tokens are not returned
1108     * @return this, to enable chaining
1109     */
1110    public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1111        this.ignoreEmptyTokens = ignoreEmptyTokens;
1112        return this;
1113    }
1114
1115    // -----------------------------------------------------------------------
1116    /**
1117     * Gets the String content that the tokenizer is parsing.
1118     *
1119     * @return the string content being parsed
1120     */
1121    public String getContent() {
1122        if (chars == null) {
1123            return null;
1124        }
1125        return new String(chars);
1126    }
1127
1128    // -----------------------------------------------------------------------
1129    /**
1130     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
1131     * list. If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
1132     *
1133     * @return a new instance of this Tokenizer which has been reset.
1134     */
1135    @Override
1136    public Object clone() {
1137        try {
1138            return cloneReset();
1139        } catch (final CloneNotSupportedException ex) {
1140            return null;
1141        }
1142    }
1143
1144    /**
1145     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
1146     * list.
1147     *
1148     * @return a new instance of this Tokenizer which has been reset.
1149     * @throws CloneNotSupportedException
1150     *             if there is a problem cloning
1151     */
1152    Object cloneReset() throws CloneNotSupportedException {
1153        // this method exists to enable 100% test coverage
1154        final StringTokenizer cloned = (StringTokenizer) super.clone();
1155        if (cloned.chars != null) {
1156            cloned.chars = cloned.chars.clone();
1157        }
1158        cloned.reset();
1159        return cloned;
1160    }
1161
1162    // -----------------------------------------------------------------------
1163    /**
1164     * Gets the String content that the tokenizer is parsing.
1165     *
1166     * @return the string content being parsed
1167     */
1168    @Override
1169    public String toString() {
1170        if (tokens == null) {
1171            return "StringTokenizer[not tokenized yet]";
1172        }
1173        return "StringTokenizer" + getTokenList();
1174    }
1175
1176}