// org.textmapper.lapg.eval.GenericLexer — Maven / Gradle / Ivy (artifact-browser header, kept as a comment so the file compiles)
/**
* Copyright 2002-2018 Evgeny Gryaznov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.textmapper.lapg.eval;
import java.io.IOException;
import java.io.Reader;
import java.text.MessageFormat;
import org.textmapper.lapg.api.Grammar;
import org.textmapper.lapg.api.LexerData;
import org.textmapper.lapg.api.LexerRule;
/**
 * Table-driven lexer that tokenizes a {@link CharSequence} using the runtime
 * automaton tables ({@link LexerData}) produced for a {@link Grammar}.
 *
 * <p>State encoding used by the automaton: values {@code >= 0} are ordinary DFA
 * states; negative values strictly between {@code tmFirstRule} and {@code 0}
 * are backtracking marks (stored pairwise in {@code tmBacktracking}); values
 * {@code <= tmFirstRule} encode an accepted rule whose index is
 * {@code tmFirstRule - state}.
 */
public class GenericLexer {

    /** A recognized token: its symbol index, source location, and optional semantic value. */
    public static class Span {
        public Object value;     // semantic value; this lexer always leaves it null
        public int symbol;       // symbol index; 0 = eoi, -1 = invalid token
        public int state;        // not written by next(); available to subclasses/callers
        public int line;         // 1-based line of the token start
        public int offset;       // start offset (in chars) of the token
        public int endoffset;    // end offset (exclusive, in chars) of the token
    }

    /** Well-known symbol indexes. */
    public interface Tokens {
        int Unavailable_ = -1;
        int eoi = 0;
    }

    /** Callback that receives lexical error messages with their source location. */
    public interface ErrorReporter {
        void error(String message, int line, int offset, int endoffset);
    }

    public static final int TOKEN_SIZE = 2048;

    // NOTE(review): never assigned or read in this class — looks like a leftover
    // from a Reader-based lexer template; confirm before removing.
    private Reader stream;
    final private ErrorReporter reporter;

    private CharSequence input;
    private int tokenOffset;    // char offset of the current token's first char
    private int l;              // index of the next char to read from input
    private int charOffset;     // char offset of the current code point (chr)
    private int chr;            // current code point (surrogate pairs joined), or -1 at end of input

    private int state;          // active lexer group; index into tmStateMap

    private int tokenLine;      // line on which the current token started
    private int currLine;       // 1-based line of the current position
    private int currOffset;     // char offset of the current position

    private final Grammar grammar;

    // Runtime automaton tables (see LexerData accessors in the constructor).
    private final int[] tmCharClass;     // code point -> character class
    private final int[] tmRuleSymbol;    // rule index -> symbol index (layout: see getRuleSymbols)
    private final int[] tmGoto;          // transition table, indexed [state * tmClassesCount + class]
    private final int[] tmStateMap;      // lexer group -> start state
    private final int[] tmBacktracking;  // (rule, next-state) pairs referenced by backtracking marks
    private final int tmFirstRule;       // boundary of "accept" states: -1 - tmBacktracking.length/2
    private final int tmClassesCount;    // number of character classes

    /**
     * Creates a lexer over {@code input}.
     *
     * @param input     text to tokenize
     * @param reporter  receives lexical error messages
     * @param lexerData generated automaton tables
     * @param grammar   grammar the tables were generated from
     * @throws IOException declared for API compatibility; not thrown by this implementation
     */
    public GenericLexer(CharSequence input, ErrorReporter reporter, LexerData lexerData, Grammar grammar) throws IOException {
        this.reporter = reporter;
        this.grammar = grammar;
        tmRuleSymbol = getRuleSymbols(grammar);
        tmCharClass = lexerData.getChar2no();
        tmGoto = lexerData.getChange();
        tmClassesCount = lexerData.getNchars();
        tmStateMap = lexerData.getGroupset();
        tmBacktracking = lexerData.getBacktracking();
        // Accept states live below all backtracking marks (two ints per mark).
        tmFirstRule = -1 - tmBacktracking.length/2;
        reset(input);
    }

    /**
     * Re-initializes the lexer at the beginning of {@code input} and reads the
     * first code point, joining a high/low surrogate pair into one code point.
     */
    public void reset(CharSequence input) throws IOException {
        this.state = 0;
        tokenLine = currLine = 1;
        currOffset = 0;
        this.input = input;
        tokenOffset = l = 0;
        charOffset = l;
        chr = l < input.length() ? input.charAt(l++) : -1;
        if (chr >= Character.MIN_HIGH_SURROGATE && chr <= Character.MAX_HIGH_SURROGATE && l < input.length() &&
                Character.isLowSurrogate(input.charAt(l))) {
            chr = Character.toCodePoint((char) chr, input.charAt(l++));
        }
    }

    /**
     * Consumes the current code point and reads the next one, updating the
     * char offset and line counters. No-op once end of input has been reached.
     */
    protected void advance() {
        if (chr == -1) return;
        currOffset += l - charOffset;
        if (chr == '\n') {
            currLine++;
        }
        charOffset = l;
        chr = l < input.length() ? input.charAt(l++) : -1;
        if (chr >= Character.MIN_HIGH_SURROGATE && chr <= Character.MAX_HIGH_SURROGATE && l < input.length() &&
                Character.isLowSurrogate(input.charAt(l))) {
            chr = Character.toCodePoint((char) chr, input.charAt(l++));
        }
    }

    /** Returns the active lexer group (state). */
    public int getState() {
        return state;
    }

    /** Switches the lexer to another group (state). */
    public void setState(int state) {
        this.state = state;
    }

    /** Returns the line on which the most recent token started. */
    public int getTokenLine() {
        return tokenLine;
    }

    /** Returns the 1-based line of the current position. */
    public int getLine() {
        return currLine;
    }

    /** Overrides the current line counter (e.g. after repositioning). */
    public void setLine(int currLine) {
        this.currLine = currLine;
    }

    /** Returns the char offset of the current position. */
    public int getOffset() {
        return currOffset;
    }

    /** Overrides the current offset counter (e.g. after repositioning). */
    public void setOffset(int currOffset) {
        this.currOffset = currOffset;
    }

    /** Returns the text of the current token. */
    public String tokenText() {
        return input.subSequence(tokenOffset, charOffset).toString();
    }

    /** Returns the length (in chars) of the current token. */
    public int tokenSize() {
        return charOffset - tokenOffset;
    }

    /**
     * Maps a code point to its character class; end-of-input (-1) maps to
     * class 0, characters beyond the table map to class 1.
     */
    private int mapCharacter(int chr) {
        if (chr >= 0 && chr < tmCharClass.length) return tmCharClass[chr];
        return chr == -1 ? 0 : 1;
    }

    /**
     * Scans and returns the next token. Invalid tokens are reported via the
     * error reporter and skipped; tokens for which {@link #createToken}
     * returns {@code false} (space rules, by default) are skipped as well.
     */
    public Span next() throws IOException {
        Span token = new Span();
        int state;
        tokenloop:
        do {
            token.offset = currOffset;
            tokenLine = token.line = currLine;
            tokenOffset = charOffset;
            // TODO use backupRule
            // NOTE(review): backupRule is decoded from backtracking marks below but never
            // consulted afterwards — backtracking recovery appears incomplete here.
            int backupRule = -1;
            for (state = tmStateMap[this.state]; state >= 0; ) {
                state = tmGoto[state * tmClassesCount + mapCharacter(chr)];
                if (state > tmFirstRule && state < 0) {
                    // Backtracking mark: remember the fallback rule and continue
                    // from the continuation state stored next to it.
                    token.endoffset = currOffset;
                    state = (-1 - state) * 2;
                    backupRule = tmBacktracking[state++];
                    state = tmBacktracking[state];
                }
                if (state == tmFirstRule && chr == -1) {
                    // Dead end exactly at end of input: report it and return eoi.
                    token.endoffset = currOffset;
                    token.symbol = 0;
                    token.value = null;
                    reporter.error("Unexpected end of input reached", token.line, token.offset, token.endoffset);
                    token.offset = currOffset;
                    break tokenloop;
                }
                if (state >= tmFirstRule && chr != -1) {
                    // Consume the current code point (advance() inlined, minus its EOF guard).
                    currOffset += l - charOffset;
                    if (chr == '\n') {
                        currLine++;
                    }
                    charOffset = l;
                    chr = l < input.length() ? input.charAt(l++) : -1;
                    if (chr >= Character.MIN_HIGH_SURROGATE && chr <= Character.MAX_HIGH_SURROGATE && l < input.length() &&
                            Character.isLowSurrogate(input.charAt(l))) {
                        chr = Character.toCodePoint((char) chr, input.charAt(l++));
                    }
                }
            }
            // Here state <= tmFirstRule: it encodes the accepted rule index.
            token.endoffset = currOffset;
            token.symbol = tmRuleSymbol[tmFirstRule - state];
            token.value = null;
            if (token.symbol == -1) {
                reporter.error(MessageFormat.format("invalid token at line {0}: `{1}`, skipped", currLine, tokenText()), token.line, token.offset, token.endoffset);
            }
        } while (token.symbol == -1 || !createToken(token, tmFirstRule - state));
        return token;
    }

    /**
     * Lookahead helper: {@code charAt(0)} is the current code point; for
     * {@code i > 0} it reads ahead from the underlying char sequence.
     * NOTE(review): the offset is applied in chars, not code points, so looking
     * past a supplementary character can land inside a surrogate pair —
     * presumably callers only peek a short distance over BMP text; confirm.
     */
    protected int charAt(int i) {
        if (i == 0) return chr;
        i += l - 1;
        int res = i < input.length() ? input.charAt(i++) : -1;
        if (res >= Character.MIN_HIGH_SURROGATE && res <= Character.MAX_HIGH_SURROGATE && i < input.length() &&
                Character.isLowSurrogate(input.charAt(i))) {
            res = Character.toCodePoint((char) res, input.charAt(i++));
        }
        return res;
    }

    /**
     * Hook invoked by {@link #next} for every matched token; return {@code true}
     * to emit the token, {@code false} to discard it and keep scanning. The
     * default implementation discards tokens produced by space rules.
     *
     * @param ruleIndex index into tmRuleSymbol: 0 = invalid token, 1 = eoi,
     *                  {@code i + 2} = lexer rule {@code i}
     */
    protected boolean createToken(Span token, int ruleIndex) throws IOException {
        int lexemeKind = ruleIndex > 1 ? grammar.getLexerRules()[ruleIndex-2].getKind() : LexerRule.KIND_NONE;
        return lexemeKind != LexerRule.KIND_SPACE;
    }

    /**
     * Builds the rule-to-symbol table: slot 0 is the invalid-token symbol (or -1
     * if the grammar has none), slot 1 is eoi, and slot {@code i + 2} is the
     * symbol of lexer rule {@code i}.
     */
    private static int[] getRuleSymbols(Grammar grammar) {
        LexerRule[] lexerRules = grammar.getLexerRules();
        int[] result = new int[lexerRules.length + 2];
        result[0] = grammar.getInvalidToken() != null ? grammar.getInvalidToken().getIndex() : -1;
        result[1] = grammar.getEoi().getIndex();
        for (int i = 0; i < lexerRules.length; i++) {
            result[i + 2] = lexerRules[i].getSymbol().getIndex();
        }
        return result;
    }
}