All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.util.StringTokenizer Maven / Gradle / Ivy

Go to download

International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support

There is a newer version: 76.1
Show newest version
/**
*******************************************************************************
* Copyright (C) 1996-2011, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/

package com.ibm.icu.util;

import java.util.Enumeration;
import java.util.NoSuchElementException;

import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

/**
 * {@icuenhanced java.util.Calendar}.{@icu _usage_}
 *
 * 

The string tokenizer class allows an application to break a string * into tokens by performing code point comparison. * The StringTokenizer methods do not distinguish * among identifiers, numbers, and quoted strings, nor do they recognize * and skip comments.

*

* The set of delimiters (the codepoints that separate tokens) may be * specified either at creation time or on a per-token basis. *

*

* An instance of StringTokenizer behaves in one of three ways, * depending on whether it was created with the returnDelims * and coalesceDelims * flags having the value true or false: *

    *
  • If returnDelims is false, delimiter code points serve to * separate tokens. A token is a maximal sequence of consecutive * code points that are not delimiters. *
  • If returnDelims is true, delimiter code points are * themselves considered to be tokens. In this case, if coalesceDelims is * true, such tokens will be the maximal sequence of consecutive * code points that are delimiters. If coalesceDelims is false, * a token will be received for each delimiter code point. *
*

A token is thus either one * delimiter code point, a maximal sequence of consecutive code points that * are delimiters, or a maximal sequence of consecutive code * points that are not delimiters. *

*

* A StringTokenizer object internally maintains a current * position within the string to be tokenized. Some operations advance this * current position past the code point processed. *

*

* A token is returned by taking a substring of the string that was used to * create the StringTokenizer object. *

*

* Example of the use of the default delimiter tokenizer. *

 * StringTokenizer st = new StringTokenizer("this is a test");
 * while (st.hasMoreTokens()) {
 *     println(st.nextToken());
 *     }
 * 
*

*

* prints the following output: *

 *     this
 *     is
 *     a
 *     test
 * 
*

*

* Example of the use of the tokenizer with user specified delimiter. *

 *     StringTokenizer st = new StringTokenizer(
 *     "this is a test with supplementary characters \ud800\ud800\udc00\udc00",
 *         " \ud800\udc00");
 *     while (st.hasMoreTokens()) {
 *         println(st.nextToken());
 *     }
 * 
*

*

* prints the following output: *

 *     this
 *     is
 *     a
 *     test
 *     with
 *     supplementary
 *     characters
 *     \ud800
 *     \udc00
 * 
*

* @author syn wee * @stable ICU 2.4 */ public final class StringTokenizer implements Enumeration { // public constructors --------------------------------------------- /** * {@icu} Constructs a string tokenizer for the specified string. All * characters in the delim argument are the delimiters for separating * tokens. *

If the returnDelims flag is false, the delimiter characters are * skipped and only serve as separators between tokens.

*

If the returnDelims flag is true, then the delimiter characters * are also returned as tokens, one per delimiter. * @param str a string to be parsed. * @param delim the delimiters. * @param returndelims flag indicating whether to return the delimiters * as tokens. * @exception NullPointerException if str is null * @stable ICU 2.4 */ public StringTokenizer(String str, UnicodeSet delim, boolean returndelims) { this(str, delim, returndelims, false); } /** * {@icu} Constructs a string tokenizer for the specified string. All * characters in the delim argument are the delimiters for separating * tokens. *

If the returnDelims flag is false, the delimiter characters are * skipped and only serve as separators between tokens.

*

If the returnDelims flag is true, then the delimiter characters * are also returned as tokens. If coalescedelims is true, one token * is returned for each run of delimiter characters, otherwise one * token is returned per delimiter. Since surrogate pairs can be * delimiters, the returned token might be two chars in length.

* @param str a string to be parsed. * @param delim the delimiters. * @param returndelims flag indicating whether to return the delimiters * as tokens. * @param coalescedelims flag indicating whether to return a run of * delimiters as a single token or as one token per delimiter. * This only takes effect if returndelims is true. * @exception NullPointerException if str is null * @internal * @deprecated This API is ICU internal only. */ public StringTokenizer(String str, UnicodeSet delim, boolean returndelims, boolean coalescedelims) { m_source_ = str; m_length_ = str.length(); if (delim == null) { m_delimiters_ = EMPTY_DELIMITER_; } else { m_delimiters_ = delim; } m_returnDelimiters_ = returndelims; m_coalesceDelimiters_ = coalescedelims; m_tokenOffset_ = -1; m_tokenSize_ = -1; if (m_length_ == 0) { // string length 0, no tokens m_nextOffset_ = -1; } else { m_nextOffset_ = 0; if (!returndelims) { m_nextOffset_ = getNextNonDelimiter(0); } } } /** * {@icu} Constructs a string tokenizer for the specified string. The * characters in the delim argument are the delimiters for separating * tokens. *

Delimiter characters themselves will not be treated as tokens.

* @param str a string to be parsed. * @param delim the delimiters. * @exception NullPointerException if str is null * @stable ICU 2.4 */ public StringTokenizer(String str, UnicodeSet delim) { this(str, delim, false, false); } /** *

Constructs a string tokenizer for the specified string. All * characters in the delim argument are the delimiters for separating * tokens.

*

If the returnDelims flag is false, the delimiter characters are * skipped and only serve as separators between tokens.

*

If the returnDelims flag is true, then the delimiter characters * are also returned as tokens, one per delimiter. * @param str a string to be parsed. * @param delim the delimiters. * @param returndelims flag indicating whether to return the delimiters * as tokens. * @exception NullPointerException if str is null * @stable ICU 2.4 */ public StringTokenizer(String str, String delim, boolean returndelims) { this(str, delim, returndelims, false); // java default behavior } /** *

Constructs a string tokenizer for the specified string. All * characters in the delim argument are the delimiters for separating * tokens.

*

If the returnDelims flag is false, the delimiter characters are * skipped and only serve as separators between tokens.

*

If the returnDelims flag is true, then the delimiter characters * are also returned as tokens. If coalescedelims is true, one token * is returned for each run of delimiter characters, otherwise one * token is returned per delimiter. Since surrogate pairs can be * delimiters, the returned token might be two chars in length.

* @param str a string to be parsed. * @param delim the delimiters. * @param returndelims flag indicating whether to return the delimiters * as tokens. * @param coalescedelims flag indicating whether to return a run of * delimiters as a single token or as one token per delimiter. * This only takes effect if returndelims is true. * @exception NullPointerException if str is null * @internal * @deprecated This API is ICU internal only. */ public StringTokenizer(String str, String delim, boolean returndelims, boolean coalescedelims) { // don't ignore whitespace m_delimiters_ = EMPTY_DELIMITER_; if (delim != null && delim.length() > 0) { m_delimiters_ = new UnicodeSet(); m_delimiters_.addAll(delim); checkDelimiters(); } m_coalesceDelimiters_ = coalescedelims; m_source_ = str; m_length_ = str.length(); m_returnDelimiters_ = returndelims; m_tokenOffset_ = -1; m_tokenSize_ = -1; if (m_length_ == 0) { // string length 0, no tokens m_nextOffset_ = -1; } else { m_nextOffset_ = 0; if (!returndelims) { m_nextOffset_ = getNextNonDelimiter(0); } } } /** *

Constructs a string tokenizer for the specified string. The * characters in the delim argument are the delimiters for separating * tokens.

*

Delimiter characters themselves will not be treated as tokens.

* @param str a string to be parsed. * @param delim the delimiters. * @exception NullPointerException if str is null * @stable ICU 2.4 */ public StringTokenizer(String str, String delim) { // don't ignore whitespace this(str, delim, false, false); } /** *

Constructs a string tokenizer for the specified string. * The tokenizer uses the default delimiter set, which is * " \t\n\r\f": * the space character, the tab character, the newline character, the * carriage-return character, and the form-feed character.

*

Delimiter characters themselves will not be treated as tokens.

* @param str a string to be parsed * @exception NullPointerException if str is null * @stable ICU 2.4 */ public StringTokenizer(String str) { this(str, DEFAULT_DELIMITERS_, false, false); } // public methods -------------------------------------------------- /** * Tests if there are more tokens available from this tokenizer's * string. * If this method returns true, then a subsequent call to * nextToken with no argument will successfully return a token. * @return true if and only if there is at least one token * in the string after the current position; false * otherwise. * @stable ICU 2.4 */ public boolean hasMoreTokens() { return m_nextOffset_ >= 0; } /** * Returns the next token from this string tokenizer. * @return the next token from this string tokenizer. * @exception NoSuchElementException if there are no more tokens in * this tokenizer's string. * @stable ICU 2.4 */ public String nextToken() { if (m_tokenOffset_ < 0) { if (m_nextOffset_ < 0) { throw new NoSuchElementException("No more tokens in String"); } // pre-calculations of tokens not done if (m_returnDelimiters_) { int tokenlimit = 0; int c = UTF16.charAt(m_source_, m_nextOffset_); boolean contains = delims == null ? m_delimiters_.contains(c) : c < delims.length && delims[c]; if (contains) { if (m_coalesceDelimiters_) { tokenlimit = getNextNonDelimiter(m_nextOffset_); } else { tokenlimit = m_nextOffset_ + UTF16.getCharCount(c); if (tokenlimit == m_length_) { tokenlimit = -1; } } } else { tokenlimit = getNextDelimiter(m_nextOffset_); } String result; if (tokenlimit < 0) { result = m_source_.substring(m_nextOffset_); } else { result = m_source_.substring(m_nextOffset_, tokenlimit); } m_nextOffset_ = tokenlimit; return result; } else { int tokenlimit = getNextDelimiter(m_nextOffset_); String result; if (tokenlimit < 0) { result = m_source_.substring(m_nextOffset_); m_nextOffset_ = tokenlimit; } else { result = m_source_.substring(m_nextOffset_, tokenlimit); m_nextOffset_ = getNextNonDelimiter(tokenlimit); } return result; } } // count was called before and we have all the tokens if (m_tokenOffset_ >= m_tokenSize_) { throw new NoSuchElementException("No more tokens in String"); } String result; if (m_tokenLimit_[m_tokenOffset_] >= 0) { result = m_source_.substring(m_tokenStart_[m_tokenOffset_], m_tokenLimit_[m_tokenOffset_]); } else { result = m_source_.substring(m_tokenStart_[m_tokenOffset_]); } m_tokenOffset_ ++; m_nextOffset_ = -1; if (m_tokenOffset_ < m_tokenSize_) { m_nextOffset_ = m_tokenStart_[m_tokenOffset_]; } return result; } /** * Returns the next token in this string tokenizer's string. First, * the set of characters considered to be delimiters by this * StringTokenizer object is changed to be the characters in * the string delim. Then the next token in the string * after the current position is returned. The current position is * advanced beyond the recognized token. The new delimiter set * remains the default after this call. * @param delim the new delimiters. * @return the next token, after switching to the new delimiter set. * @exception NoSuchElementException if there are no more tokens in * this tokenizer's string. * @stable ICU 2.4 */ public String nextToken(String delim) { m_delimiters_ = EMPTY_DELIMITER_; if (delim != null && delim.length() > 0) { m_delimiters_ = new UnicodeSet(); m_delimiters_.addAll(delim); } return nextToken(m_delimiters_); } /** * {@icu} Returns the next token in this string tokenizer's string. First, * the set of characters considered to be delimiters by this * StringTokenizer object is changed to be the characters in * the string delim. Then the next token in the string * after the current position is returned. The current position is * advanced beyond the recognized token. The new delimiter set * remains the default after this call. * @param delim the new delimiters. * @return the next token, after switching to the new delimiter set. * @exception NoSuchElementException if there are no more tokens in * this tokenizer's string. * @stable ICU 2.4 */ public String nextToken(UnicodeSet delim) { m_delimiters_ = delim; checkDelimiters(); m_tokenOffset_ = -1; m_tokenSize_ = -1; if (!m_returnDelimiters_) { m_nextOffset_ = getNextNonDelimiter(m_nextOffset_); } return nextToken(); } /** * Returns the same value as the hasMoreTokens method. * It exists so that this class can implement the * Enumeration interface. * @return true if there are more tokens; * false otherwise. * @see #hasMoreTokens() * @stable ICU 2.4 */ public boolean hasMoreElements() { return hasMoreTokens(); } /** * Returns the same value as the nextToken method, except * that its declared return value is Object rather than * String. It exists so that this class can implement the * Enumeration interface. * @return the next token in the string. * @exception NoSuchElementException if there are no more tokens in * this tokenizer's string. * @see #nextToken() * @stable ICU 2.4 */ public Object nextElement() { return nextToken(); } /** * Calculates the number of times that this tokenizer's * nextToken method can be called before it generates an * exception. The current position is not advanced. * @return the number of tokens remaining in the string using the * current delimiter set. * @see #nextToken() * @stable ICU 2.4 */ public int countTokens() { int result = 0; if (hasMoreTokens()) { if (m_tokenOffset_ >= 0) { return m_tokenSize_ - m_tokenOffset_; } if (m_tokenStart_ == null) { m_tokenStart_ = new int[TOKEN_SIZE_]; m_tokenLimit_ = new int[TOKEN_SIZE_]; } do { if (m_tokenStart_.length == result) { int temptokenindex[] = m_tokenStart_; int temptokensize[] = m_tokenLimit_; int originalsize = temptokenindex.length; int newsize = originalsize + TOKEN_SIZE_; m_tokenStart_ = new int[newsize]; m_tokenLimit_ = new int[newsize]; System.arraycopy(temptokenindex, 0, m_tokenStart_, 0, originalsize); System.arraycopy(temptokensize, 0, m_tokenLimit_, 0, originalsize); } m_tokenStart_[result] = m_nextOffset_; if (m_returnDelimiters_) { int c = UTF16.charAt(m_source_, m_nextOffset_); boolean contains = delims == null ? m_delimiters_.contains(c) : c < delims.length && delims[c]; if (contains) { if (m_coalesceDelimiters_) { m_tokenLimit_[result] = getNextNonDelimiter( m_nextOffset_); } else { int p = m_nextOffset_ + 1; if (p == m_length_) { p = -1; } m_tokenLimit_[result] = p; } } else { m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_); } m_nextOffset_ = m_tokenLimit_[result]; } else { m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_); m_nextOffset_ = getNextNonDelimiter(m_tokenLimit_[result]); } result ++; } while (m_nextOffset_ >= 0); m_tokenOffset_ = 0; m_tokenSize_ = result; m_nextOffset_ = m_tokenStart_[0]; } return result; } // private data members ------------------------------------------------- /** * Current offset to the token array. If the array token is not set up yet, * this value is a -1 */ private int m_tokenOffset_; /** * Size of the token array. If the array token is not set up yet, * this value is a -1 */ private int m_tokenSize_; /** * Array of pre-calculated tokens start indexes in source string terminated * by -1. * This is only set up during countTokens() and only stores the remaining * tokens, not all tokens including parsed ones */ private int m_tokenStart_[]; /** * Array of pre-calculated tokens limit indexes in source string. * This is only set up during countTokens() and only stores the remaining * tokens, not all tokens including parsed ones */ private int m_tokenLimit_[]; /** * UnicodeSet containing delimiters */ private UnicodeSet m_delimiters_; /** * String to parse for tokens */ private String m_source_; /** * Length of m_source_ */ private int m_length_; /** * Current position in string to parse for tokens */ private int m_nextOffset_; /** * Flag indicator if delimiters are to be treated as tokens too */ private boolean m_returnDelimiters_; /** * Flag indicating whether to coalesce runs of delimiters into single tokens */ private boolean m_coalesceDelimiters_; /** * Default set of delimiters \t\n\r\f */ private static final UnicodeSet DEFAULT_DELIMITERS_ = new UnicodeSet(0x09, 0x0a, 0x0c, 0x0d, 0x20, 0x20); // UnicodeSet("[ \t\n\r\f]", false) /** * Array size increments */ private static final int TOKEN_SIZE_ = 100; /** * A empty delimiter UnicodeSet, used when user specified null delimiters */ private static final UnicodeSet EMPTY_DELIMITER_ = UnicodeSet.EMPTY; // private methods ------------------------------------------------------ /** * Gets the index of the next delimiter after offset * @param offset to the source string * @return offset of the immediate next delimiter, otherwise * (- source string length - 1) if there * are no more delimiters after m_nextOffset */ private int getNextDelimiter(int offset) { if (offset >= 0) { int result = offset; int c = 0; if (delims == null) { do { c = UTF16.charAt(m_source_, result); if (m_delimiters_.contains(c)) { break; } result ++; } while (result < m_length_); } else { do { c = UTF16.charAt(m_source_, result); if (c < delims.length && delims[c]) { break; } result ++; } while (result < m_length_); } if (result < m_length_) { return result; } } return -1 - m_length_; } /** * Gets the index of the next non-delimiter after m_nextOffset_ * @param offset to the source string * @return offset of the immediate next non-delimiter, otherwise * (- source string length - 1) if there * are no more delimiters after m_nextOffset */ private int getNextNonDelimiter(int offset) { if (offset >= 0) { int result = offset; int c = 0; if (delims == null) { do { c = UTF16.charAt(m_source_, result); if (!m_delimiters_.contains(c)) { break; } result ++; } while (result < m_length_); } else { do { c = UTF16.charAt(m_source_, result); if (!(c < delims.length && delims[c])) { break; } result ++; } while (result < m_length_); } if (result < m_length_) { return result; } } return -1 - m_length_; } void checkDelimiters() { if (m_delimiters_ == null || m_delimiters_.size() == 0) { delims = new boolean[0]; } else { int maxChar = m_delimiters_.getRangeEnd(m_delimiters_.getRangeCount()-1); if (maxChar < 0x7f) { delims = new boolean[maxChar+1]; for (int i = 0, ch; -1 != (ch = m_delimiters_.charAt(i)); ++i) { delims[ch] = true; } } else { delims = null; } } } private boolean[] delims; }