org.textmapper.tool.parser.TMLexer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of textmapper Show documentation
There is a newer version: 0.9.5
/**
 * Copyright 2002-2020 Evgeny Gryaznov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.textmapper.tool.parser;

import java.io.IOException;
import java.io.Reader;
import java.text.MessageFormat;
import java.util.HashMap;
import java.util.Map;
import org.textmapper.tool.parser.action.SActionLexer;
import org.textmapper.tool.parser.action.SActionParser;

public class TMLexer {

	public static class Span {
		public Object value;
		public int symbol;
		public int state;
		public int line;
		public int offset;
		public int endoffset;
	}

	public interface States {
		int initial = 0;
		int afterID = 1;
		int afterColonOrEq = 2;
		int afterGT = 3;
	}

	public interface Tokens {
		int Unavailable_ = -1;
		int eoi = 0;
		int scon = 1;
		int icon = 2;
		int _skip = 3;
		int _skip_comment = 4;
		int _skip_multiline = 5;
		int Rem = 6;
		int ColonColon = 7;
		int Or = 8;
		int OrOr = 9;
		int Assign = 10;
		int AssignAssign = 11;
		int ExclAssign = 12;
		int Semicolon = 13;
		int Dot = 14;
		int Comma = 15;
		int Colon = 16;
		int Lbrack = 17;
		int Rbrack = 18;
		int Lparen = 19;
		int LparenQuestAssign = 20;
		int MinusGt = 21;
		int Rparen = 22;
		int Rbrace = 23;
		int Lt = 24;
		int Gt = 25;
		int Mult = 26;
		int Plus = 27;
		int PlusAssign = 28;
		int Quest = 29;
		int Excl = 30;
		int Tilde = 31;
		int And = 32;
		int AndAnd = 33;
		int Dollar = 34;
		int Atsign = 35;
		int error = 36;
		int ID = 37;
		int as = 38;
		int _false = 39;
		int _implements = 40;
		int _import = 41;
		int separator = 42;
		int set = 43;
		int _true = 44;
		int _assert = 45;
		int brackets = 46;
		int _class = 47;
		int empty = 48;
		int expect = 49;
		int expectMinusrr = 50;
		int explicit = 51;
		int flag = 52;
		int generate = 53;
		int global = 54;
		int inline = 55;
		int input = 56;
		int _interface = 57;
		int lalr = 58;
		int language = 59;
		int layout = 60;
		int left = 61;
		int lexer = 62;
		int lookahead = 63;
		int noMinuseoi = 64;
		int nonassoc = 65;
		int nonempty = 66;
		int param = 67;
		int parser = 68;
		int prec = 69;
		int returns = 70;
		int right = 71;
		int char_s = 72;
		int shift = 73;
		int space = 74;
		int _void = 75;
		int char_x = 76;
		int code = 77;
		int Lbrace = 78;
		int regexp = 79;
		int Div = 80;
	}

	public interface ErrorReporter {
		void error(String message, int line, int offset, int endoffset);
	}

	public static final int TOKEN_SIZE = 2048;

	private Reader stream;
	final private ErrorReporter reporter;

	private CharSequence input;
	private int tokenOffset;
	private int l;
	private int charOffset;
	private int chr;

	private int state;

	private int tokenLine;
	private int currLine;
	private int currOffset;

	protected boolean inStatesSelector = false;
	protected boolean afterColonColon = false;
	private int templatesStart = -1;
	private boolean skipComments = true;

	int getTemplatesStart() {
		return templatesStart;
	}

	public void setSkipComments(boolean skip) {
		this.skipComments = skip;
	}

	private boolean skipAction() throws IOException {
		final int[] ind = new int[] { 0 };
		SActionLexer.ErrorReporter innerreporter = (String message, int line, int offset) ->
				reporter.error(message, line, offset, offset + 1);
		SActionLexer l = new SActionLexer(innerreporter) {
			@Override
			protected int nextChar() throws IOException {
				if (ind[0] < 2) {
					return ind[0]++ == 0 ? '{' : chr;
				}
				TMLexer.this.advance();
				return chr;
			}
		};
		SActionParser p = new SActionParser(innerreporter);
		try {
			p.parse(l);
		} catch (SActionParser.ParseException e) {
			reporter.error("syntax error in action", getLine(), getOffset(), getOffset() + 1);
			return false;
		}
		return true;
	}

	private String unescape(String s, int start, int end) {
		StringBuilder sb = new StringBuilder();
		end = Math.min(end, s.length());
		for (int i = start; i < end; i++) {
			char c = s.charAt(i);
			if (c == '\\') {
				if (++i == end) {
					break;
				}
				c = s.charAt(i);
				if (c == 'u' || c == 'x') {
					// FIXME process unicode
				} else if (c == 'n') {
					sb.append('\n');
				} else if (c == 'r') {
					sb.append('\r');
				} else if (c == 't') {
					sb.append('\t');
				} else {
					sb.append(c);
				}
			} else {
				sb.append(c);
			}
		}
		return sb.toString();
	}

	public TMLexer(CharSequence input, ErrorReporter reporter) throws IOException {
		this.reporter = reporter;
		reset(input);
	}

	public void reset(CharSequence input) throws IOException {
		this.state = 0;
		tokenLine = currLine = 1;
		currOffset = 0;
		this.input = input;
		tokenOffset = l = 0;
		charOffset = l;
		chr = l < input.length() ? input.charAt(l++) : -1;
		if (chr >= Character.MIN_HIGH_SURROGATE && chr <= Character.MAX_HIGH_SURROGATE && l < input.length() &&
				Character.isLowSurrogate(input.charAt(l))) {
			chr = Character.toCodePoint((char) chr, input.charAt(l++));
		}
		inStatesSelector = false;
		afterColonColon = false;
	}

	protected void advance() {
		if (chr == -1) return;
		currOffset += l - charOffset;
		if (chr == '\n') {
			currLine++;
		}
		charOffset = l;
		chr = l < input.length() ? input.charAt(l++) : -1;
		if (chr >= Character.MIN_HIGH_SURROGATE && chr <= Character.MAX_HIGH_SURROGATE && l < input.length() &&
				Character.isLowSurrogate(input.charAt(l))) {
			chr = Character.toCodePoint((char) chr, input.charAt(l++));
		}
	}

	public int getState() {
		return state;
	}

	public void setState(int state) {
		this.state = state;
	}

	public int getTokenLine() {
		return tokenLine;
	}

	public int getLine() {
		return currLine;
	}

	public void setLine(int currLine) {
		this.currLine = currLine;
	}

	public int getOffset() {
		return currOffset;
	}

	public void setOffset(int currOffset) {
		this.currOffset = currOffset;
	}

	public String tokenText() {
		return input.subSequence(tokenOffset, charOffset).toString();
	}

	public int tokenSize() {
		return charOffset - tokenOffset;
	}

	private static final short tmCharClass[] = {
		1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 4, 1, 1,
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
		2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
		20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 22, 23, 24, 25, 26,
		27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
		28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 29, 30, 31, 1, 28,
		1, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
		28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 32, 33, 34, 35
	};

	private static final short tmStateMap[] = {
		0, 0, 52, 62
	};

	private static final short tmBacktracking[] = {
		38, 9, 81, 21, 21, 33
	};

	private static final int tmFirstRule = -4;

	private static final int[] tmRuleSymbol = unpack_int(82,
		"\uffff\uffff\0\0\1\0\2\0\0\0\3\0\4\0\5\0\6\0\7\0\10\0\11\0\12\0\13\0\14\0\15\0\16" +
		"\0\17\0\20\0\21\0\22\0\23\0\24\0\25\0\26\0\27\0\30\0\31\0\32\0\33\0\34\0\35\0\36" +
		"\0\37\0\40\0\41\0\42\0\43\0\45\0\46\0\47\0\50\0\51\0\52\0\53\0\54\0\55\0\56\0\57" +
		"\0\60\0\61\0\62\0\63\0\64\0\65\0\66\0\67\0\70\0\71\0\72\0\73\0\74\0\75\0\76\0\77" +
		"\0\100\0\101\0\102\0\103\0\104\0\105\0\106\0\107\0\110\0\111\0\112\0\113\0\114\0" +
		"\115\0\116\0\117\0\120\0");

	private static final int tmClassesCount = 36;

	private static final short[] tmGoto = unpack_vc_short(2304,
		"\1\ufffb\1\ufffc\3\63\1\61\1\56\1\54\1\53\1\50\1\46\1\43\1\40\1\37\1\36\1\34\1\33" +
		"\1\31\1\30\1\24\1\23\1\21\1\20\1\17\1\15\1\14\1\13\1\12\1\10\1\7\1\ufffc\1\6\1\5" +
		"\1\3\1\2\1\1\44\uffdb\44\uffe3\41\ufff2\1\4\2\ufff2\44\ufff1\44\uffae\44\uffe8\44" +
		"\uffe9\21\uffd6\1\uffff\2\uffd6\1\10\7\uffd6\1\10\7\uffd6\21\ufffc\1\11\2\ufffc\1" +
		"\10\7\ufffc\1\10\7\ufffc\44\uffd7\44\uffdd\44\uffe1\30\ufff0\1\16\13\ufff0\44\uffef" +
		"\44\uffe2\44\uffed\25\uffea\1\22\16\uffea\44\ufff3\24\ufff9\1\23\17\ufff9\16\uffab" +
		"\1\ufffe\25\uffab\1\ufffc\15\25\1\26\25\25\1\ufffc\15\25\1\26\4\25\1\27\20\25\44" +
		"\ufff5\44\uffec\24\ufffc\1\23\4\ufffc\1\32\12\ufffc\44\uffe5\44\uffeb\30\uffdf\1" +
		"\35\13\uffdf\44\uffde\44\uffe0\44\uffe4\32\uffe7\1\ufffd\11\uffe7\30\ufffc\1\42\13" +
		"\ufffc\44\uffe6\1\ufffc\2\43\1\ufffc\7\43\1\45\22\43\1\44\5\43\1\ufffc\2\43\1\ufffc" +
		"\40\43\44\uffd6\12\uffda\1\47\31\uffda\44\uffd9\11\ufff4\1\51\32\ufff4\1\ufff8\2" +
		"\51\1\52\40\51\44\ufff8\44\uffd8\1\ufff6\2\54\1\55\40\54\44\ufff6\1\ufffc\2\56\1" +
		"\ufffc\2\56\1\60\27\56\1\57\5\56\1\ufffc\2\56\1\ufffc\40\56\44\ufffa\30\uffdc\1\62" +
		"\13\uffdc\44\uffee\2\ufff7\3\63\37\ufff7\2\ufffc\3\63\1\61\1\56\1\54\1\53\1\50\1" +
		"\46\1\43\1\40\1\37\1\36\1\34\1\33\1\31\1\30\1\65\1\23\1\21\1\20\1\17\1\15\1\14\1" +
		"\13\1\12\1\10\1\7\1\ufffc\1\6\1\5\1\3\1\2\1\1\1\ufffc\2\71\2\ufffc\11\71\1\25\4\71" +
		"\1\ufffc\11\71\1\67\1\66\5\71\1\ufffc\2\71\1\ufffc\40\71\1\ufffc\2\67\2\ufffc\31" +
		"\67\1\70\1\71\4\67\1\ufffc\2\67\1\ufffc\40\67\1\ufffc\2\71\2\ufffc\16\71\1\75\11" +
		"\71\1\73\1\72\5\71\1\ufffc\2\71\1\ufffc\40\71\1\ufffc\2\73\2\ufffc\31\73\1\74\1\71" +
		"\4\73\1\ufffc\2\73\1\ufffc\40\73\44\uffac\2\ufffc\3\63\1\61\1\56\1\54\1\53\1\50\1" +
		"\46\1\43\1\40\1\37\1\36\1\34\1\33\1\31\1\30\1\24\1\23\1\21\1\20\1\17\1\15\1\14\1" +
		"\13\1\12\1\10\1\7\1\ufffc\1\6\1\77\1\3\1\2\1\1\44\uffad");

	private static short[] unpack_vc_short(int size, String... st) {
		short[] res = new short[size];
		int t = 0;
		int count = 0;
		for (String s : st) {
			int slen = s.length();
			for (int i = 0; i < slen; ) {
				count = i > 0 || count == 0 ? s.charAt(i++) : count;
				if (i < slen) {
					short val = (short) s.charAt(i++);
					while (count-- > 0) res[t++] = val;
				}
			}
		}
		assert res.length == t;
		return res;
	}

	private static int mapCharacter(int chr) {
		if (chr >= 0 && chr < 127) return tmCharClass[chr];
		return chr == -1 ? 0 : 1;
	}

	public Span next() throws IOException {
		Span token = new Span();
		int state;

		tokenloop:
		do {
			token.offset = currOffset;
			tokenLine = token.line = currLine;
			tokenOffset = charOffset;

			// TODO use backupRule
			int backupRule = -1;
			for (state = tmStateMap[this.state]; state >= 0; ) {
				state = tmGoto[state * tmClassesCount + mapCharacter(chr)];
				if (state > tmFirstRule && state < 0) {
					token.endoffset = currOffset;
					state = (-1 - state) * 2;
					backupRule = tmBacktracking[state++];
					state = tmBacktracking[state];
				}
				if (state == tmFirstRule && chr == -1) {
					token.endoffset = currOffset;
					token.symbol = 0;
					token.value = null;
					reporter.error("Unexpected end of input reached", token.line, token.offset, token.endoffset);
					token.offset = currOffset;
					break tokenloop;
				}
				if (state >= tmFirstRule && chr != -1) {
					currOffset += l - charOffset;
					if (chr == '\n') {
						currLine++;
					}
					charOffset = l;
					chr = l < input.length() ? input.charAt(l++) : -1;
					if (chr >= Character.MIN_HIGH_SURROGATE && chr <= Character.MAX_HIGH_SURROGATE && l < input.length() &&
							Character.isLowSurrogate(input.charAt(l))) {
						chr = Character.toCodePoint((char) chr, input.charAt(l++));
					}
				}
			}
			token.endoffset = currOffset;

			token.symbol = tmRuleSymbol[tmFirstRule - state];
			token.value = null;

			if (token.symbol == -1) {
				reporter.error(MessageFormat.format("invalid token at line {0}: `{1}`, skipped", currLine, tokenText()), token.line, token.offset, token.endoffset);
			}

		} while (token.symbol == -1 || !createToken(token, tmFirstRule - state));
		switch (token.symbol) {
		case Tokens.Lt:
			inStatesSelector = this.state == States.initial || this.state == States.afterColonOrEq;
			this.state = States.initial;
			break;
		case Tokens.Gt:
			this.state = inStatesSelector ? States.afterGT : States.initial;
			inStatesSelector = false;
			break;
		case Tokens.Assign:
		case Tokens.Colon:
			this.state = States.afterColonOrEq;
			break;
		case Tokens.ID:
		case Tokens.left:
		case Tokens.right:
		case Tokens.nonassoc:
		case Tokens.generate:
		case Tokens._assert:
		case Tokens.empty:
		case Tokens.brackets:
		case Tokens.inline:
		case Tokens.prec:
		case Tokens.shift:
		case Tokens.returns:
		case Tokens.input:
		case Tokens.nonempty:
		case Tokens.global:
		case Tokens.explicit:
		case Tokens.lookahead:
		case Tokens.param:
		case Tokens.flag:
		case Tokens.noMinuseoi:
		case Tokens.char_s:
		case Tokens.char_x:
		case Tokens._class:
		case Tokens._interface:
		case Tokens._void:
		case Tokens.space:
		case Tokens.layout:
		case Tokens.language:
		case Tokens.lalr:
		  this.state = States.afterID;
		  break;
		case Tokens.lexer:
		case Tokens.parser:
		  this.state = afterColonColon ? States.initial : States.afterID;
		  break;
		case Tokens._skip:
		case Tokens._skip_comment:
		case Tokens._skip_multiline:
		  // Note: these do not affect the '::' tracking.
			return token;
		default:
			this.state = States.initial;
		}
		this.afterColonColon = (token.symbol == Tokens.ColonColon);
		return token;
	}

	protected int charAt(int i) {
		if (i == 0) return chr;
		i += l - 1;
		int res = i < input.length() ? input.charAt(i++) : -1;
		if (res >= Character.MIN_HIGH_SURROGATE && res <= Character.MAX_HIGH_SURROGATE && i < input.length() &&
				Character.isLowSurrogate(input.charAt(i))) {
			res = Character.toCodePoint((char) res, input.charAt(i++));
		}
		return res;
	}

	protected boolean createToken(Span token, int ruleIndex) throws IOException {
		boolean spaceToken = false;
		switch (ruleIndex) {
			case 2: // scon: /"([^\n\\"]|\\.)*"/
				{ token.value = unescape(tokenText(), 1, tokenSize()-1); }
				break;
			case 3: // icon: /\-?[0-9]+/
				{ token.value = Integer.parseInt(tokenText()); }
				break;
			case 4: // eoi: /%%.*(\r?\n)?/
				{ templatesStart = token.endoffset; }
				break;
			case 5: // _skip: /[\n\r\t ]+/
				spaceToken = true;
				break;
			case 6: // _skip_comment: /#.*(\r?\n)?/
				{ spaceToken = skipComments; }
				break;
			case 7: // _skip_multiline: /\/\*{commentChars}\*\//
				spaceToken = true;
				break;
			case 38:
				return createIDToken(token, ruleIndex);
			case 78: // code: /\{/
				{ skipAction(); token.endoffset = getOffset(); }
				break;
			case 80: // regexp: /\/{reFirst}{reChar}*\//
				{ token.value = tokenText().substring(1, tokenSize()-1); }
				break;
		}
		return !(spaceToken);
	}

	private static Map subTokensOfID = new HashMap<>();
	static {
		subTokensOfID.put("as", 39);
		subTokensOfID.put("false", 40);
		subTokensOfID.put("implements", 41);
		subTokensOfID.put("import", 42);
		subTokensOfID.put("separator", 43);
		subTokensOfID.put("set", 44);
		subTokensOfID.put("true", 45);
		subTokensOfID.put("assert", 46);
		subTokensOfID.put("brackets", 47);
		subTokensOfID.put("class", 48);
		subTokensOfID.put("empty", 49);
		subTokensOfID.put("expect", 50);
		subTokensOfID.put("expect-rr", 51);
		subTokensOfID.put("explicit", 52);
		subTokensOfID.put("flag", 53);
		subTokensOfID.put("generate", 54);
		subTokensOfID.put("global", 55);
		subTokensOfID.put("inline", 56);
		subTokensOfID.put("input", 57);
		subTokensOfID.put("interface", 58);
		subTokensOfID.put("lalr", 59);
		subTokensOfID.put("language", 60);
		subTokensOfID.put("layout", 61);
		subTokensOfID.put("left", 62);
		subTokensOfID.put("lexer", 63);
		subTokensOfID.put("lookahead", 64);
		subTokensOfID.put("no-eoi", 65);
		subTokensOfID.put("nonassoc", 66);
		subTokensOfID.put("nonempty", 67);
		subTokensOfID.put("param", 68);
		subTokensOfID.put("parser", 69);
		subTokensOfID.put("prec", 70);
		subTokensOfID.put("returns", 71);
		subTokensOfID.put("right", 72);
		subTokensOfID.put("s", 73);
		subTokensOfID.put("shift", 74);
		subTokensOfID.put("space", 75);
		subTokensOfID.put("void", 76);
		subTokensOfID.put("x", 77);
	}

	protected boolean createIDToken(Span token, int ruleIndex) {
		Integer replacement = subTokensOfID.get(tokenText());
		if (replacement != null) {
			ruleIndex = replacement;
			token.symbol = tmRuleSymbol[ruleIndex];
		}
		return true;
	}

	/* package */ static int[] unpack_int(int size, String... st) {
		int[] res = new int[size];
		boolean second = false;
		char first = 0;
		int t = 0;
		for (String s : st) {
			int slen = s.length();
			for (int i = 0; i < slen; i++) {
				if (second) {
					res[t++] = (s.charAt(i) << 16) + first;
				} else {
					first = s.charAt(i);
				}
				second = !second;
			}
		}
		assert !second;
		assert res.length == t;
		return res;
	}

}