
// com.gitee.melin.bee.text.TokenStream Maven / Gradle / Ivy
package com.gitee.melin.bee.text;
import java.util.*;
import javax.annotation.concurrent.Immutable;
import javax.annotation.concurrent.NotThreadSafe;
/** A stream of tokens that can be consumed by a parser. This class is not thread-safe. */
@NotThreadSafe
public class TokenStream {
/**
 * An opaque marker for a position within the token stream.
 *
 * <p>Markers are ordered by the index of the token they were taken at.
 *
 * @see TokenStream#mark()
 */
public static final class Marker implements Comparable<Marker> {
    private final int tokenIndex;
    private final Position position;

    private Marker(Position position, int index) {
        this.position = position;
        this.tokenIndex = index;
    }

    /**
     * Get the position of this marker, or null if this is at the start or end of the token
     * stream.
     *
     * @return the position.
     */
    public Position position() {
        return position;
    }

    /**
     * Compare two markers by their token index.
     *
     * @param that the other marker; may not be null
     * @return a negative value, zero, or a positive value if this marker is before, at, or
     *     after the other marker
     */
    @Override
    public int compareTo(Marker that) {
        if (this == that) {
            return 0;
        }
        // Integer.compare cannot overflow, unlike subtracting the two indexes.
        return Integer.compare(this.tokenIndex, that.tokenIndex);
    }

    @Override
    public String toString() {
        return Integer.toString(tokenIndex);
    }
}
/** Wildcard accepted by the {@code matches(...)} methods in place of a concrete token value. */
public static final String ANY_VALUE = "any value";
/** Wildcard accepted by the {@code matches(...)} methods in place of a concrete token type. */
public static final int ANY_TYPE = Integer.MIN_VALUE;
/** The original input; tokens take their values as substrings of it. */
protected final String inputString;
/** The input as a character array, handed to the tokenizer by {@link #start()}. */
private final char[] inputContent;
private final boolean caseSensitive;
private final Tokenizer tokenizer;
/** The tokens produced by the tokenizer; null until {@link #start()} has been called. */
private List<Token> tokens;
/**
 * This class navigates the Token objects using this iterator. However, because it very often
 * needs to access the "current token" in the "consume(...)" and "canConsume(...)" and
 * "matches(...)" methods, the class caches a "current token" and makes this iterator point to
 * the 2nd token.
 *
 * <pre>
 *     T1     T2    T3    T4    T5
 *         ^   ^  ^
 *         |   |  |
 *         |   |  +- The position of the tokenIterator, where tokenIterator.next() will return T3
 *         |   +---- The token referenced by currentToken
 *         +-------- The logical position of the TokenStream object, where the "consume()" would return T2
 * </pre>
 */
private ListIterator<Token> tokenIterator;
/** The cached current token; null before the stream is started and once it is completed. */
private Token currentToken;
/** Set once the iterator has been advanced past the last token. */
private boolean completed;
/**
 * Create a token stream over the supplied content. Nothing is tokenized here; that happens
 * when {@link #start()} is called.
 *
 * @param content the text to be tokenized; may not be null
 * @param tokenizer the component that splits the content into tokens; may not be null
 * @param caseSensitive true to create case-sensitive tokens, or false to create
 *     case-insensitive tokens
 * @throws NullPointerException if the content or the tokenizer is null
 */
public TokenStream(String content, Tokenizer tokenizer, boolean caseSensitive) {
    this.inputString = Objects.requireNonNull(content, "content");
    this.tokenizer = Objects.requireNonNull(tokenizer, "tokenizer");
    this.inputContent = this.inputString.toCharArray();
    this.caseSensitive = caseSensitive;
}
/**
 * Begin the token stream, including (if required) the tokenization of the input content.
 *
 * <p>The content is tokenized only on the first call. Calling this method again moves the
 * stream back to the first token, even if the stream had already been consumed to the end.
 *
 * @return this object for easy method chaining; never null
 * @throws ParsingException if an error occurs during tokenization of the content
 */
public TokenStream start() throws ParsingException {
    // Create the tokens (once) ...
    if (tokens == null) {
        TokenFactory tokenFactory =
                caseSensitive ? new CaseSensitiveTokenFactory() : new CaseInsensitiveTokenFactory();
        CharacterStream characterStream = new CharacterArrayStream(inputContent);
        tokenizer.tokenize(characterStream, tokenFactory);
        this.tokens = initializeTokens(tokenFactory.getTokens());
    }
    // Clear any state left over from a previous pass; otherwise a stream that had been
    // consumed to the end would still report itself as completed after being restarted.
    completed = false;
    currentToken = null;
    // Create the iterator ...
    tokenIterator = this.tokens.listIterator();
    moveToNextToken();
    return this;
}
/**
 * Method to allow subclasses to pre-process the set of tokens and return the correct tokens to
 * use. The default behavior is to simply return the supplied tokens.
 *
 * <p>Called by {@link #start()} with the tokens produced by the tokenizer; the returned list
 * is the one the stream iterates over.
 *
 * @param tokens the tokens
 * @return list of tokens.
 */
protected List<Token> initializeTokens(List<Token> tokens) {
    return tokens;
}
/**
 * Obtain a marker that records the current position so that the stream can be {@link
 * #rewind(Marker)} back to the mark even after having been advanced beyond the mark.
 *
 * <p>If the stream has already been consumed to the end, the marker has no position and
 * refers to the end of the stream, so rewinding to it leaves the stream completed.
 *
 * @return the marker; never null
 * @throws IllegalStateException if this method was called before the stream was {@link #start()
 *     started}
 */
public Marker mark() {
    if (completed) {
        // The iterator sits past the last token, so nextIndex() is the end-of-stream index.
        // Using previousIndex() here would make the marker indistinguishable from one taken
        // at the last token, and rewinding to it would bring that token back.
        return new Marker(null, tokenIterator.nextIndex());
    }
    Token current = currentToken();
    // The iterator is one step ahead of the current token, so previousIndex() is its index.
    return new Marker(current.position(), tokenIterator.previousIndex());
}
/**
 * Reset the stream back to the position described by the supplied marker. This method does
 * nothing if the mark is invalid. For example, it is not possible to advance the token stream
 * beyond the current position.
 *
 * @param marker the marker; a null marker is treated as invalid
 * @return true if the token stream was reset, or false if the marker was invalid
 * @throws IllegalStateException if this method was called before the stream was {@link #start()
 *     started}
 */
public boolean rewind(Marker marker) {
    if (tokenIterator == null) {
        throw new IllegalStateException("start() method must be called before rewind()");
    }
    if (marker == null) {
        return false;
    }
    int target = marker.tokenIndex;
    if (target < 0 || target > tokenIterator.nextIndex()) {
        return false;
    }
    completed = false;
    currentToken = null;
    // Position the iterator just before the marked token, then load it as the current token.
    tokenIterator = this.tokens.listIterator(target);
    moveToNextToken();
    return true;
}
/**
 * Return the value of the current token and advance the stream to the next token.
 *
 * @return the value of the token that was current when this method was called
 * @throws ParsingException if the stream has already been consumed to the end
 * @throws IllegalStateException if this method was called before the stream was {@link #start()
 *     started}
 */
public String consume() throws ParsingException, IllegalStateException {
    if (completed) {
        throwNoMoreContent();
    }
    // Capture the value first, since advancing replaces the current token.
    final Token consumed = currentToken();
    final String value = consumed.value();
    moveToNextToken();
    return value;
}
/**
 * Throw a {@link ParsingException} reporting that the stream has no more content. The
 * exception carries the position of the last token, or a placeholder position when the stream
 * has no tokens at all.
 *
 * @throws ParsingException always
 */
protected void throwNoMoreContent() throws ParsingException {
    final Position where;
    if (tokens.isEmpty()) {
        where = new Position(-1, 1, 0);
    } else {
        where = tokens.get(tokens.size() - 1).position();
    }
    throw new ParsingException(where, "No more content");
}
/**
 * Return the value of the current token without advancing the stream.
 *
 * @return the value of the current token
 * @throws ParsingException if there is no such token because the stream has been consumed to
 *     the end
 * @throws IllegalStateException if this method was called before the stream was {@link #start()
 *     started}
 */
public String peek() throws ParsingException, IllegalStateException {
    if (completed) {
        throwNoMoreContent();
    }
    // Get the value from the current token but do NOT advance ...
    return currentToken().value();
}
/**
 * Check whether the current token has the expected value, regardless of its type.
 *
 * <p>The {@link #ANY_VALUE ANY_VALUE} constant can be used as a wildcard.
 *
 * @param expected the expected value of the current token
 * @return true if the current token matched, or false if it did not
 * @throws IllegalStateException if this method was called before the stream was {@link #start()
 *     started}
 */
public boolean matches(String expected) throws IllegalStateException {
    // Any type is acceptable; only the value is compared.
    final boolean matched = matches(ANY_TYPE, expected);
    return matched;
}
/**
 * Check whether the current token has both the expected type and the expected value.
 *
 * <p>The {@link #ANY_VALUE ANY_VALUE} constant can be used as a wildcard for the value.
 *
 * @param type the expected type of the current token
 * @param expected the expected value of the current token
 * @return true if the current token matched, or false if it did not (including when the
 *     stream has been consumed to the end)
 * @throws IllegalStateException if this method was called before the stream was {@link #start()
 *     started}
 */
public boolean matches(int type, String expected) throws IllegalStateException {
    if (completed) {
        return false;
    }
    final boolean valueMatches =
            Objects.equals(expected, ANY_VALUE) || currentToken().matches(expected);
    if (!valueMatches) {
        return false;
    }
    return currentToken().matches(type);
}
/**
 * Check whether the value of the current token matches at least one of the supplied options.
 *
 * @param options the candidate values for the current token
 * @return true if the current token's value matched one of the options, or false otherwise
 *     (including when the stream has been consumed to the end)
 * @throws IllegalStateException if this method was called before the stream was {@link #start()
 *     started}
 */
public boolean matchesAnyOf(String[] options) throws IllegalStateException {
    if (completed) {
        return false;
    }
    final Token candidate = currentToken();
    return Arrays.stream(options).anyMatch(option -> candidate.matches(option));
}
/**
 * Report whether the stream still has a token that can be consumed.
 *
 * @return true if there is another token ready for consumption, or false otherwise
 * @throws IllegalStateException if this method was called before the stream was {@link #start()
 *     started}
 */
public boolean hasNext() {
    if (tokenIterator != null) {
        return !completed;
    }
    throw new IllegalStateException("start() method must be called before hasNext()");
}
/**
 * Render the tokens that have not been consumed yet, starting with the current token and
 * separated by single spaces. At most 21 tokens are listed; if more remain, {@code " ..."} is
 * appended. The result is empty if the stream has not been started or has been consumed to
 * the end.
 *
 * @return the remaining tokens as a string; never null
 */
@Override
public String toString() {
    if (tokens == null || tokenIterator == null) {
        // Not started yet: there is nothing to show, and no iterator to ask for a position.
        return "";
    }
    // While a current token exists, the iterator is one step past it. Once completed there is
    // no current token; previousIndex() would then name the last (already consumed) token, or
    // be -1 for an empty token list, which listIterator(int) rejects.
    int from = completed ? tokenIterator.nextIndex() : tokenIterator.previousIndex();
    ListIterator<?> remaining = tokens.listIterator(from);
    StringBuilder sb = new StringBuilder();
    int printed = 0;
    while (remaining.hasNext()) {
        if (printed > 20) {
            sb.append(" ...");
            break;
        }
        if (printed > 0) {
            sb.append(" ");
        }
        sb.append(remaining.next());
        ++printed;
    }
    return sb.toString();
}
/**
 * Advance the stream by one token, optionally inserting new tokens at the iterator position
 * first.
 *
 * <p>If new tokens are supplied, they are inserted at the iterator position, the first of them
 * becomes the current token, and the iterator is left just after that first token. Otherwise
 * the next token from the iterator becomes the current token, or the stream is marked as
 * completed if no token is left.
 *
 * @param newTokens the tokens to insert; may be null or empty to simply advance
 */
private void moveToNextToken(List<Token> newTokens) {
    if (newTokens != null && !newTokens.isEmpty()) {
        for (Token t : newTokens) {
            tokenIterator.add(t);
        }
        // add() leaves the cursor after the last inserted token; step back so that it sits
        // just after the first inserted token, which becomes the current token.
        for (int i = 0; i < newTokens.size() - 1; i++) {
            tokenIterator.previous();
        }
        currentToken = newTokens.get(0);
        return;
    }
    // And move the currentToken to the next token ...
    if (!tokenIterator.hasNext()) {
        completed = true;
        currentToken = null;
    } else {
        currentToken = tokenIterator.next();
    }
}

/** Advance the stream by one token without inserting any new tokens. */
private void moveToNextToken() {
    moveToNextToken(null);
}
/**
 * Return the cached current token, failing if there is none.
 *
 * @return the current token; never null
 * @throws IllegalStateException if this method was called before the stream was {@link #start()
 *     started}
 * @throws NoSuchElementException if there are no more tokens
 */
final Token currentToken() throws IllegalStateException, NoSuchElementException {
    if (currentToken != null) {
        return currentToken;
    }
    // No current token: either the stream is exhausted or it was never started.
    if (completed) {
        throw new NoSuchElementException("No more content");
    }
    throw new IllegalStateException("start() method must be called before consuming or matching");
}
/**
* Interface for a Tokenizer component responsible for processing the characters in a {@link
* CharacterStream} and constructing the appropriate {@link Token} objects.
*
* <p>{@link TokenStream#start()} invokes the tokenizer the first time the stream is started.
*/
public interface Tokenizer {
/**
* Process the supplied characters and construct the appropriate {@link Token} objects.
*
* @param input the character input stream; never null
* @param tokens the factory for {@link Token} objects, which records the order in which the
* tokens are created
* @throws ParsingException if there is an error while processing the character stream
* (e.g., a quote is not closed, etc.)
*/
void tokenize(CharacterStream input, Tokens tokens) throws ParsingException;
}
/**
* Interface used by a {@link Tokenizer} to iterate through the characters in the content input
* to the {@link TokenStream}.
*/
public interface CharacterStream {
/**
* Determine if there is another character available in this stream.
*
* @return true if there is another character (and {@link #next()} can be called), or false
* otherwise
*/
boolean hasNext();
/**
* Obtain the next character value, and advance the stream.
*
* @return the next character
* @throws NoSuchElementException if there is no {@link #hasNext() next character}
*/
char next();
/**
* Get the index for the last character returned from {@link #next()}.
*
* @return the index of the last character returned
*/
int index();
/**
* Get the position for the last character returned from {@link #next()}.
*
* @param startIndex the starting index to record in the returned position
* @return the position of the last character returned; never null
*/
Position position(int startIndex);
}
/**
* A factory for Token objects, used by a {@link Tokenizer} to create tokens in the correct
* order.
*/
public interface Tokens {
/**
* Create a single-character token at the supplied index in the character stream. The token
* type is set to 0, meaning this is equivalent to calling
* {@code addToken(position, index, index + 1)} or
* {@code addToken(position, index, index + 1, 0)}.
*
* @param position the position (line and column numbers) of this new token; may not be null
* @param index the index of the character to appear in the token; must be a valid index in
* the stream
*/
default void addToken(Position position, int index) {
addToken(position, index, index + 1, 0);
}
/**
* Create a single- or multi-character token with the characters in the range given by the
* starting and ending index in the character stream. The character at the ending index is
* not included in the token (as this is standard practice when using 0-based
* indexes). The token type is set to 0, meaning this is equivalent to calling
* {@code addToken(position, startIndex, endIndex, 0)}.
*
* @param position the position (line and column numbers) of this new token; may not be null
* @param startIndex the index of the first character to appear in the token; must be a
* valid index in the stream
* @param endIndex the index just past the last character to appear in the token; must be a
* valid index in the stream
*/
default void addToken(Position position, int startIndex, int endIndex) {
addToken(position, startIndex, endIndex, 0);
}
/**
* Create a single- or multi-character token with the supplied type and with the characters
* in the range given by the starting and ending index in the character stream. The
* character at the ending index is not included in the token (as this is standard
* practice when using 0-based indexes).
*
* @param position the position (line and column numbers) of this new token; may not be null
* @param startIndex the index of the first character to appear in the token; must be a
* valid index in the stream
* @param endIndex the index just past the last character to appear in the token; must be a
* valid index in the stream
* @param type the type of the token
*/
void addToken(Position position, int startIndex, int endIndex, int type);
}
/**
* The interface defining a token, which references the characters in the actual input character
* stream.
*
* @see CaseSensitiveTokenFactory
* @see CaseInsensitiveTokenFactory
*/
@Immutable
public interface Token {
/**
* Get the value of the token, in actual case.
*
* @return the value
*/
String value();
/**
* Determine if the token matches the supplied string.
*
* @param expected the expected value
* @return true if the token's value matches the supplied value, or false otherwise
*/
boolean matches(String expected);
/**
* Determine if the token matches the supplied string and is of a requested type.
*
* <p>The default implementation checks the type first and the value second.
*
* @param expectedType the expected token type
* @param expected the expected value
* @return true if the token's type and value matches the supplied type and value, or false
* otherwise
*/
default boolean matches(int expectedType, String expected) {
return matches(expectedType) && matches(expected);
}
/**
* Determine if the token matches the supplied character.
*
* @param expected the expected character value
* @return true if the token's value matches the supplied character value, or false
* otherwise
*/
boolean matches(char expected);
/**
* Determine if the token matches the supplied type.
*
* @param expectedType the expected integer type
* @return true if the token's type matches the supplied integer type, or false otherwise
*/
boolean matches(int expectedType);
/**
* Get the type of the token.
*
* @return the token's type
*/
int type();
/**
* Get the index in the raw stream for the first character in the token.
*
* @return the starting index of the token
*/
int startIndex();
/**
* Get the index in the raw stream past the last character in the token.
*
* @return the ending index of the token, which is past the last character
*/
int endIndex();
/**
* Get the length of the token, which is equivalent to {@code endIndex() - startIndex()}.
*
* @return the length
*/
int length();
/**
* Get the position of this token, which includes the line number and column number of the
* first character in the token.
*
* @return the position; never null
*/
Position position();
/**
* Bitmask ORed with existing type value.
*
* @param typeMask the mask of types
* @return copy of Token with new type
*/
Token withType(int typeMask);
}
/**
 * An immutable {@link Token} that implements matching using case-sensitive logic.
 *
 * <p>The token stores only its index range, type and position; its characters are read from
 * the enclosing stream's input string.
 */
@Immutable
protected class CaseSensitiveToken implements Token {
    private final int startIndex;
    private final int endIndex;
    private final int type;
    private final Position position;

    /**
     * Create a token covering {@code [startIndex, endIndex)} of the input.
     *
     * @param startIndex the index of the first character of the token
     * @param endIndex the index just past the last character of the token
     * @param type the type of the token
     * @param position the position of the token
     */
    public CaseSensitiveToken(int startIndex, int endIndex, int type, Position position) {
        this.startIndex = startIndex;
        this.endIndex = endIndex;
        this.type = type;
        this.position = position;
    }

    @Override
    public Token withType(int typeMask) {
        int combined = this.type | typeMask;
        return new CaseSensitiveToken(startIndex, endIndex, combined, position);
    }

    @Override
    public final int type() {
        return type;
    }

    @Override
    public final int startIndex() {
        return startIndex;
    }

    @Override
    public final int endIndex() {
        return endIndex;
    }

    @Override
    public final int length() {
        return endIndex - startIndex;
    }

    @Override
    public final boolean matches(char expected) {
        return length() == 1 && matchString().charAt(startIndex) == expected;
    }

    @Override
    public boolean matches(String expected) {
        return matchString().substring(startIndex, endIndex).equals(expected);
    }

    /**
     * Determine if this token is of the supplied type. {@link #ANY_TYPE} matches every token;
     * otherwise every bit of the expected type must be set in this token's type.
     *
     * @param expectedType the expected type, or {@link #ANY_TYPE}
     * @return true if this token's type matches, or false otherwise
     */
    @Override
    public final boolean matches(int expectedType) {
        // Test this token's own type. Consulting the stream's current token instead would give
        // the wrong answer for any other token and would throw once the stream is exhausted.
        return expectedType == ANY_TYPE || (type & expectedType) == expectedType;
    }

    @Override
    public final String value() {
        return inputString.substring(startIndex, endIndex);
    }

    @Override
    public Position position() {
        return position;
    }

    /**
     * Get the string that the {@code matches} methods read this token's characters from.
     *
     * @return the string to match against; this implementation returns the input string
     */
    protected String matchString() {
        return inputString;
    }

    @Override
    public String toString() {
        return value();
    }
}
/** An immutable {@link Token} that implements matching using case-insensitive logic. */
@Immutable
protected class CaseInsensitiveToken extends CaseSensitiveToken {
    /**
     * Create a token covering {@code [startIndex, endIndex)} of the input.
     *
     * @param startIndex the index of the first character of the token
     * @param endIndex the index just past the last character of the token
     * @param type the type of the token
     * @param position the position of the token
     */
    public CaseInsensitiveToken(int startIndex, int endIndex, int type, Position position) {
        super(startIndex, endIndex, type, position);
    }

    /**
     * Determine if the token matches the supplied string, ignoring case.
     *
     * @param expected the expected value, in any case; null never matches
     * @return true if the token's value matches the supplied value, or false otherwise
     */
    @Override
    public boolean matches(String expected) {
        if (expected == null) {
            return false;
        }
        String actual = matchString().substring(startIndex(), endIndex());
        // Fold both sides with a fixed locale: the default locale can change the result (for
        // example the Turkish dotted/dotless i), and folding only the token would make the
        // match case-insensitive for upper-case expectations alone.
        return actual.toUpperCase(Locale.ROOT).equals(expected.toUpperCase(Locale.ROOT));
    }

    @Override
    public Token withType(int typeMask) {
        int combined = this.type() | typeMask;
        return new CaseInsensitiveToken(startIndex(), endIndex(), combined, position());
    }
}
/**
 * Base class for {@link Tokens} implementations that collect the tokens they create, in
 * creation order, in a list.
 */
protected abstract class TokenFactory implements Tokens {
    protected final List<Token> tokens = new ArrayList<>();

    /**
     * Get the tokens created so far.
     *
     * @return the live list of tokens, in the order they were added; never null
     */
    public List<Token> getTokens() {
        return tokens;
    }
}
/** A token factory whose tokens are {@link CaseSensitiveToken} instances. */
public class CaseSensitiveTokenFactory extends TokenFactory {
    /** Create a case-sensitive token for the given range and type and append it to the list. */
    @Override
    public void addToken(Position position, int startIndex, int endIndex, int type) {
        final CaseSensitiveToken created =
                new CaseSensitiveToken(startIndex, endIndex, type, position);
        tokens.add(created);
    }
}
/** A token factory whose tokens are {@link CaseInsensitiveToken} instances. */
public class CaseInsensitiveTokenFactory extends TokenFactory {
    /** Create a case-insensitive token for the given range and type and append it to the list. */
    @Override
    public void addToken(Position position, int startIndex, int endIndex, int type) {
        final CaseInsensitiveToken created =
                new CaseInsensitiveToken(startIndex, endIndex, type, position);
        tokens.add(created);
    }
}
/**
 * An implementation of {@link CharacterStream} that works with a single character array.
 *
 * <p>While characters are read, the stream tracks a line number (starting at 1) and a column
 * number. A line break is counted for {@code "\r"}, for {@code "\n"}, and once for the pair
 * {@code "\r\n"}.
 */
public static final class CharacterArrayStream implements CharacterStream {
    private final char[] content;
    private int lastIndex = -1;
    private final int maxIndex;
    private int lineNumber = 1;
    private int columnNumber = 0;
    /** True only directly after a {@code '\r'}, so that a following {@code '\n'} is not counted again. */
    private boolean nextCharMayBeLineFeed;

    /**
     * Create a stream over the supplied characters. The array is used as is, not copied.
     *
     * @param content the characters to read; may not be null
     */
    public CharacterArrayStream(char[] content) {
        this.content = content;
        this.maxIndex = content.length - 1;
    }

    @Override
    public boolean hasNext() {
        return lastIndex < maxIndex;
    }

    @Override
    public int index() {
        return lastIndex;
    }

    /**
     * Create a position from the supplied index and the current line and column numbers.
     *
     * @param startIndex the index to record in the position
     * @return the new position; never null
     */
    @Override
    public Position position(int startIndex) {
        return new Position(startIndex, lineNumber, columnNumber);
    }

    @Override
    public char next() {
        if (lastIndex >= maxIndex) {
            throw new NoSuchElementException();
        }
        char result = content[++lastIndex];
        ++columnNumber;
        if (result == '\r') {
            nextCharMayBeLineFeed = true;
            ++lineNumber;
            columnNumber = 0;
        } else {
            if (result == '\n') {
                // The '\n' of a "\r\n" pair was already counted when the '\r' was read.
                if (!nextCharMayBeLineFeed) {
                    ++lineNumber;
                }
                columnNumber = 0;
            }
            // Any character other than '\r' ends the "may be the LF of a CRLF" state. Leaving
            // the flag set after '\n' would swallow the line break of a directly following
            // '\n', as in "\r\n\n".
            nextCharMayBeLineFeed = false;
        }
        return result;
    }
}
}