org.antlr.intellij.adaptor.lexer.AntlrLexerAdapter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of ide-plugin Show documentation
IDE plugin for generating java interfaces for sqlite files
There is a newer version: 0.9.0
package org.antlr.intellij.adaptor.lexer;

import com.intellij.lang.Language;
import com.intellij.psi.tree.IElementType;
import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.IntStream;
import org.antlr.v4.runtime.Lexer;
import org.antlr.v4.runtime.Token;
import org.jetbrains.annotations.Nullable;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * This is the base class for implementations of {@link com.intellij.lexer.Lexer} backed by an ANTLR 4 lexer.
 *
 * For lexers which do not store custom state information, the default implementation {@link SimpleAntlrAdapter} can
 * be used.
 *
 * @author Sam Harwell
 */
public abstract class AntlrLexerAdapter extends com.intellij.lexer.LexerBase {
	/**
	 * Gets the {@link Language} supported by this lexer. This value is passed to {@link ElementTypeFactory} to ensure
	 * the correct collection of {@link IElementType} is used for assigning element types to tokens in
	 * {@link #getTokenType}.
	 */
	private final Language language;
	/**
	 * This field caches the collection of element types returned by {@link ElementTypeFactory#getTokenElementTypes} for
	 * optimum efficiency of the {@link #getTokenType} method.
	 */
	private final List tokenElementTypes;
	/**
	 * This is the backing field for {@link #getLexer()}.
	 */
	private final Lexer lexer;

	/**
	 * Provides a map from a {@code State} object → state index tracked by IntelliJ. This field provides for an
	 * efficient implementation of {@link #getState}.
	 */
	private final Map stateCacheMap = new HashMap();
	/**
	 * Provides a map from a state index tracked by IntelliJ → {@code State} object describing the ANTLR lexer
	 * state. This field provides for an efficient implementation of {@link #toLexerState}.
	 */
	private final List stateCache = new ArrayList();

	/**
	 * Caches the {@code buffer} provided in the call to {@link #start}, as required for implementing
	 * {@link #getBufferSequence}.
	 */
	private CharSequence buffer;
	/**
	 * Caches the {@code endOffset} provided in the call to {@link #start}, as required for implementing
	 * {@link #getBufferEnd}.
	 */
	private int endOffset;

	/**
	 * This field tracks the "exposed" lexer state, which differs from the actual current state of the lexer returned by
	 * {@link #getLexer()} by one token.
	 *
	 * Due to the way IntelliJ requests token information, the ANTLR {@link Lexer} is always positioned one token
	 * past the token whose information is returned by calls to {@link #getTokenType}, {@link #getTokenType}, etc. When
	 * {@link #getState} is called, IntelliJ expects a state which is able to reproduce the {@link #currentToken}, but
	 * the ANTLR lexer has already moved past it. This field is assigned based in {@link #advance} based on the lexer
	 * state before the current token, after which {@link Lexer#nextToken} can be called to obtain
	 * {@link #currentToken}.
	 */
	private State currentState;
	/**
	 * This field tracks the "exposed" lexer token. This is the result of the most recent call to
	 * {@link Lexer#nextToken} on the underlying ANTLR lexer, and is the source of information for
	 * {@link #getTokenStart}, {@link #getTokenType}, etc.
	 *
	 * @see #currentState
	 */
	private Token currentToken;

	/**
	 * Constructs a new instance of {@link AntlrLexerAdapter} with the specified {@link Language} and underlying
	 * ANTLR {@link Lexer}.
	 *
	 * @param language The language.
	 * @param lexer The underlying ANTLR lexer.
	 */
	public AntlrLexerAdapter(Language language, Lexer lexer) {
		this.language = language;
		this.tokenElementTypes = ElementTypeFactory.getTokenElementTypes(language, Arrays.asList(lexer.getTokenNames()));
		this.lexer = lexer;
	}

	/**
	 * Gets the ANTLR {@link Lexer} used for actual tokenization of the input.
	 *
	 * @return the ANTLR {@link Lexer} instance
	 */
	protected Lexer getLexer() {
		return lexer;
	}

	/**
	 * Gets the {@link Token} object providing information for calls to {@link #getTokenStart}, {@link #getTokenType},
	 * etc.
	 *
	 * @return The current {@link Token} instance.
	 */
	protected Token getCurrentToken() {
		return currentToken;
	}

	@Override
	public void start(CharSequence buffer, int startOffset, int endOffset, int initialState) {
		this.buffer = buffer;
		this.endOffset = endOffset;

		CharStream in = new CharSequenceCharStream(buffer, endOffset, IntStream.UNKNOWN_SOURCE_NAME);
		in.seek(startOffset);

		State state;
		if (startOffset == 0 && initialState == 0) {
			state = getInitialState();
		} else {
			state = toLexerState(initialState);
		}

		applyLexerState(in, state);
		advance();
	}

	@Nullable
	@Override
	public IElementType getTokenType() {
		int type = currentToken.getType();
		if (type == Token.EOF) {
			// return null when lexing is finished
			return null;
		}

		return tokenElementTypes.get(type);
	}

	@Override
	public void advance() {
		currentState = getLexerState(lexer);
		currentToken = lexer.nextToken();
	}

	@Override
	public int getState() {
		State state = currentState != null ? currentState : getInitialState();
		Integer existing = stateCacheMap.get(state);
		if (existing == null) {
			existing = stateCache.size();
			stateCache.add(state);
			stateCacheMap.put(state, existing);
		}

		return existing;
	}

	@Override
	public int getTokenStart() {
		return currentToken.getStartIndex();
	}

	@Override
	public int getTokenEnd() {
		return currentToken.getStopIndex() + 1;
	}

	@Override
	public CharSequence getBufferSequence() {
		return buffer;
	}

	@Override
	public int getBufferEnd() {
		return endOffset;
	}

	/**
	 * Update the current lexer to use the specified {@code input} stream starting in the specified {@code state}.
	 *
	 * The current lexer may be obtained by calling {@link #getLexer}. The default implementation calls
	 * {@link Lexer#setInputStream} to set the input stream, followed by {@link AntlrLexerState#apply} to initialize the
	 * state of the lexer.
	 *
	 * @param input The new input stream for the lexer.
	 * @param state A {@code State} instance containing the starting state for the lexer.
	 */
	protected void applyLexerState(CharStream input, State state) {
		lexer.setInputStream(input);
		state.apply(lexer);
	}

	/**
	 * Get the initial {@code State} of the lexer.
	 *
	 * @return a {@code State} instance representing the state of the lexer at the beginning of an input.
	 */
	protected abstract State getInitialState();

	/**
	 * Get a {@code State} instance representing the current state of the specified lexer.
	 *
	 * @param lexer The lexer.
	 * @return A {@code State} instance containing the current state of the lexer.
	 */
	protected abstract State getLexerState(Lexer lexer);

	/**
	 * Gets the {@code State} corresponding to the specified IntelliJ {@code state}.
	 *
	 * @param state The lexer state provided by IntelliJ.
	 * @return The {@code State} instance corresponding to the specified state.
	 */
	protected State toLexerState(int state) {
		return stateCache.get(state);
	}
}