// org.apache.commons.text.StringTokenizer Maven / Gradle / Ivy
// Show all versions of commons-text Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.text;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.matcher.StringMatcher;
import org.apache.commons.text.matcher.StringMatcherFactory;
/**
* Tokenizes a string based on delimiters (separators) and supporting quoting and ignored character concepts.
*
* This class can split a String into many smaller strings. It aims to do a similar job to
* {@link java.util.StringTokenizer StringTokenizer}, however it offers much more control and flexibility including
* implementing the {@code ListIterator} interface. By default, it is set up like {@code StringTokenizer}.
*
* The input String is split into a number of tokens. Each token is separated from the next String by a
* delimiter. One or more delimiter characters must be specified.
*
* Each token may be surrounded by quotes. The quote matcher specifies the quote character(s). A quote may be
* escaped within a quoted section by duplicating itself.
*
* Between each token and the delimiter are potentially characters that need trimming. The trimmer matcher
* specifies these characters. One usage might be to trim whitespace characters.
*
* At any point outside the quotes there might potentially be invalid characters. The ignored matcher specifies
* these characters to be removed. One usage might be to remove new line characters.
*
* Empty tokens may be removed or returned as null.
*
* <pre>
* "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
* " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
* "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
* </pre>
*
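* <table>
* <caption>StringTokenizer properties and options</caption>
* <tr>
* <th>Property</th>
* <th>Type</th>
* <th>Default</th>
* </tr>
* <tr>
* <td>delim</td>
* <td>CharSetMatcher</td>
* <td>{ \t\n\r\f}</td>
* </tr>
* <tr>
* <td>quote</td>
* <td>NoneMatcher</td>
* <td>{}</td>
* </tr>
* <tr>
* <td>ignore</td>
* <td>NoneMatcher</td>
* <td>{}</td>
* </tr>
* <tr>
* <td>emptyTokenAsNull</td>
* <td>boolean</td>
* <td>false</td>
* </tr>
* <tr>
* <td>ignoreEmptyTokens</td>
* <td>boolean</td>
* <td>true</td>
* </tr>
* </table>
*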
* @since 1.3
*/
public class StringTokenizer implements ListIterator<String>, Cloneable {

    /** Prototype for comma separated value tokenizers; callers only ever receive clones of it. */
    private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE;

    /** Prototype for tab separated value tokenizers; callers only ever receive clones of it. */
    private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE;

    static {
        // CSV: comma delimiter, double-quote quoting, nothing ignored, trimming enabled, empty tokens kept as "".
        CSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        // TSV: identical to CSV except that the delimiter is the tab matcher.
        TSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /**
     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     */
    private static StringTokenizer getCSVClone() {
        return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings. The default for CSV processing is to
     * trim whitespace from both ends (which can be overridden with the {@link #setTrimmerMatcher(StringMatcher)}
     * method).
     * <p>
     * No input is set on the returned instance; you must call a "reset" method to set the string which you want to
     * parse.
     * </p>
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings, initializing it with the given input.
     * The default for CSV processing is to trim whitespace from both ends (which can be overridden with the
     * {@link #setTrimmerMatcher(StringMatcher)} method).
     *
     * @param input
     *            the text to parse; the array is cloned, null sets no text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance(final char[] input) {
        return getCSVClone().reset(input);
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings, initializing it with the given input.
     * The default for CSV processing is to trim whitespace from both ends (which can be overridden with the
     * {@link #setTrimmerMatcher(StringMatcher)} method).
     *
     * @param input
     *            the text to parse, null sets no text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance(final String input) {
        return getCSVClone().reset(input);
    }

    /**
     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     */
    private static StringTokenizer getTSVClone() {
        return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing is to
     * trim whitespace from both ends (which can be overridden with the {@link #setTrimmerMatcher(StringMatcher)}
     * method).
     * <p>
     * No input is set on the returned instance; you must call a "reset" method to set the string which you want to
     * parse.
     * </p>
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings, initializing it with the given input.
     * The default for TSV processing is to trim whitespace from both ends (which can be overridden with the
     * {@link #setTrimmerMatcher(StringMatcher)} method).
     *
     * @param input
     *            the text to parse; the array is cloned, null sets no text to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final char[] input) {
        return getTSVClone().reset(input);
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings, initializing it with the given input.
     * The default for TSV processing is to trim whitespace from both ends (which can be overridden with the
     * {@link #setTrimmerMatcher(StringMatcher)} method).
     *
     * @param input
     *            the string to parse, null sets no text to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final String input) {
        return getTSVClone().reset(input);
    }

    /** The text to work on; null when no input has been set. */
    private char[] chars;

    /** The parsed tokens; null until tokenization has been performed (see {@link #checkTokenized()}). */
    private String[] tokens;

    /** The current iteration position, i.e. the index of the token {@link #next()} would return. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();

    /** The quote matcher. */
    private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** The ignored matcher. */
    private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** The trimmer matcher. */
    private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull;

    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to
     * tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     * </p>
     */
    public StringTokenizer() {
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
     *
     * @param input
     *            the characters to parse; the array is cloned, null sets no text to parse
     */
    public StringTokenizer(final char[] input) {
        this.chars = input != null ? input.clone() : null;
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input
     *            the characters to parse; the array is cloned, null sets no text to parse
     * @param delim
     *            the field delimiter character
     */
    public StringTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input
     *            the characters to parse; the array is cloned, null sets no text to parse
     * @param delim
     *            the field delimiter character
     * @param quote
     *            the field quoted string character
     */
    public StringTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input
     *            the characters to parse; the array is cloned, null sets no text to parse
     * @param delim
     *            the field delimiter string
     */
    public StringTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input
     *            the characters to parse; the array is cloned, null sets no text to parse
     * @param delim
     *            the field delimiter matcher
     */
    public StringTokenizer(final char[] input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input
     *            the characters to parse; the array is cloned, null sets no text to parse
     * @param delim
     *            the field delimiter matcher
     * @param quote
     *            the field quoted string matcher
     */
    public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
     *
     * @param input
     *            the string which is to be parsed, null sets no text to parse
     */
    public StringTokenizer(final String input) {
        this.chars = input != null ? input.toCharArray() : null;
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input
     *            the string which is to be parsed, null sets no text to parse
     * @param delim
     *            the field delimiter character
     */
    public StringTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input
     *            the string which is to be parsed, null sets no text to parse
     * @param delim
     *            the field delimiter character
     * @param quote
     *            the field quoted string character
     */
    public StringTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input
     *            the string which is to be parsed, null sets no text to parse
     * @param delim
     *            the field delimiter string
     */
    public StringTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input
     *            the string which is to be parsed, null sets no text to parse
     * @param delim
     *            the field delimiter matcher
     */
    public StringTokenizer(final String input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input
     *            the string which is to be parsed, null sets no text to parse
     * @param delim
     *            the field delimiter matcher
     * @param quote
     *            the field quoted string matcher
     */
    public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj
     *            this parameter ignored.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     * <p>
     * A null or empty token is dropped when empty tokens are ignored; otherwise it is stored as null when
     * empty-as-null is enabled, or stored unchanged.
     * </p>
     *
     * @param list
     *            the list to add to
     * @param tok
     *            the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (tok == null || tok.isEmpty()) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Checks if tokenization has been done, and if not then do it.
     * <p>
     * The result is cached in {@code tokens} until one of the {@code reset} methods clears it.
     * </p>
     */
    private void checkTokenized() {
        if (tokens == null) {
            final List<String> split;
            if (chars == null) {
                // still call tokenize as subclass may do some work
                split = tokenize(null, 0, 0);
            } else {
                split = tokenize(chars, 0, chars.length);
            }
            tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
     * list. If a {@link CloneNotSupportedException} is caught, return {@code null}.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            // documented contract of this method: signal the failure with null rather than an exception
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
     * list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException
     *             if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StringTokenizer cloned = (StringTokenizer) super.clone();
        if (cloned.chars != null) {
            // give the clone its own copy of the input so the two instances do not share the array
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return The string content being parsed, or null if no input has been set
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    /**
     * Gets the field delimiter matcher.
     *
     * @return The delimiter matcher in use
     */
    public StringMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are within a quoted region. The default value
     * is not to ignore anything.
     * </p>
     *
     * @return The ignored matcher in use
     */
    public StringMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. A
     * newly constructed tokenizer uses the "none" matcher from {@link StringMatcherFactory} for quotes; the CSV and
     * TSV factory methods configure the double quote matcher instead.
     * </p>
     *
     * @return The quote matcher in use
     */
    public StringMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return The tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return The tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        return new ArrayList<>(Arrays.asList(tokens));
    }

    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default
     * value is not to trim anything.
     * </p>
     *
     * @return The trimmer matcher in use
     */
    public StringMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Tests whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Tests whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Tests whether the tokenizer currently returns empty tokens as null. The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Tests whether the tokenizer currently ignores empty tokens. The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Tests if the characters at the index specified match the quote already matched in readNextToken().
     * <p>
     * The comparison is made character by character against the quote text found at {@code quoteStart}; a quote
     * length of zero therefore always matches.
     * </p>
     *
     * @param srcChars
     *            the character array being tokenized
     * @param pos
     *            the position to check for a quote
     * @param len
     *            the length of the character array being tokenized
     * @param quoteStart
     *            the start position of the matched quote, 0 if no quoting
     * @param quoteLen
     *            the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart,
            final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            // running past the end of the input counts as a mismatch
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Gets the next token.
     *
     * @return The next String token
     * @throws NoSuchElementException
     *             if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return The next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing
     * {@link NoSuchElementException} when no tokens remain.
     *
     * @return The next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return The previous token
     * @throws NoSuchElementException
     *             if there is no previous token
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return The previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Gets the previous token from the String. Equivalent to {@link #previous()} except it returns null rather than
     * throwing {@link NoSuchElementException} when there is no previous token.
     *
     * @return The previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars
     *            the character array being tokenized
     * @param start
     *            the first character of field
     * @param len
     *            the length of the character array being tokenized
     * @param workArea
     *            a temporary work area
     * @param tokenList
     *            the list of parsed tokens
     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of
     *         string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            // the token opens with a quote: read from just after it, remembering where the quote text sits
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars
     *            the character array being tokenized
     * @param start
     *            the first character of field
     * @param len
     *            the length of the character array being tokenized
     * @param workArea
     *            a temporary work area
     * @param tokenList
     *            the list of parsed tokens
     * @param quoteStart
     *            the start position of the matched quote, 0 if no quoting
     * @param quoteLen
     *            the length of the matched quote, 0 if no quoting
     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if the
     *         end of the input was reached
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        // length of the work area up to the last character that must be kept; anything after it is trailing
        // trimmable text that is cut off when the token is emitted
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if its at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }
            }
            // copy a regular character (quoted or unquoted) and mark it as content that must be kept
            workArea.append(srcChars[pos++]);
            trimStart = workArea.size();
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     * </p>
     *
     * @return this, to enable chaining
     */
    public StringTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
     * same settings on multiple input lines.
     *
     * @param input
     *            the new character array to tokenize; the array is cloned, null sets no text to parse
     * @return this, to enable chaining
     */
    public StringTokenizer reset(final char[] input) {
        reset();
        this.chars = input != null ? input.clone() : null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
     * same settings on multiple input lines.
     *
     * @param input
     *            the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StringTokenizer reset(final String input) {
        reset();
        this.chars = input != null ? input.toCharArray() : null;
        return this;
    }

    /**
     * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
     *
     * @param obj
     *            this parameter ignored.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim
     *            the delimiter character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     * </p>
     *
     * @param delim
     *            the delimiter matcher to use; unlike the other matcher setters, null is not ignored but replaced by
     *            the "none" matcher from {@link StringMatcherFactory}
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
        this.delimMatcher = delim == null ? StringMatcherFactory.INSTANCE.noneMatcher() : delim;
        return this;
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim
     *            the delimiter string to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
     *
     * @param emptyAsNull
     *            whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is within a quoted region.
     * </p>
     *
     * @param ignored
     *            the ignored character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are within a quoted region.
     * </p>
     *
     * @param ignored
     *            the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
     *
     * @param ignoreEmptyTokens
     *            whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
     * </p>
     *
     * @param quote
     *            the quote character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
     * </p>
     *
     * @param quote
     *            the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter until the token or quote is found.
     * </p>
     *
     * @param trimmer
     *            the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    /**
     * Gets the number of tokens found in the String.
     *
     * @return The number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method will be called automatically by other
     * (public) methods when required.
     * </p>
     * <p>
     * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass
     * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple
     * strings. It is also possible to filter the results.
     * </p>
     * <p>
     * {@code StringTokenizer} will always pass a zero offset and a count equal to the length of the array to this
     * method, however a subclass may pass other values, or even an entirely different array.
     * </p>
     * <p>
     * NOTE(review): the implementation scans from {@code offset} while the position is below {@code count}, i.e.
     * {@code count} acts as the exclusive end index into {@code srcChars}. That equals the number of characters only
     * for a zero offset; subclasses passing a non-zero offset should take this into account.
     * </p>
     *
     * @param srcChars
     *            the character array being tokenized, may be null
     * @param offset
     *            the start position within the character array, must be valid
     * @param count
     *            the number of characters to tokenize, must be valid
     * @return The modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final TextStringBuilder buf = new TextStringBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Returns a description of this tokenizer for debugging: the class name followed by the token list, or a
     * placeholder if the input has not been tokenized yet. Calling this method never triggers tokenization.
     *
     * @return The string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StringTokenizer[not tokenized yet]";
        }
        return "StringTokenizer" + getTokenList();
    }
}