package org.ssssssss.script.parsing;

import org.ssssssss.script.MagicScriptError;
import org.ssssssss.script.exception.StringLiteralException;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;


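/**
 * Tokenizer for magic-script source text. Walks a {@link CharacterStream} and emits a
 * {@link TokenStream} covering comments, numeric literals, quoted and template strings,
 * regular-expression literals, embedded language blocks, identifiers/keywords and the
 * simple operator tokens defined in {@link TokenType}.
 *
 * <p>Minimal usage sketch:
 * <pre>
 *     TokenStream tokens = Tokenizer.tokenize("var a = 1 + 2;");
 * </pre>
 */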
public class Tokenizer {

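	/**
	 * Tokenizes the given source, discarding comment tokens.
	 */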
	public static TokenStream tokenize(String source) {
		return tokenize(source, false);
	}

	public static TokenStream tokenize(String source, boolean matchComment) {
		CharacterStream stream = new CharacterStream(source, 0, source.length());
		List<Token> tokens = new ArrayList<>();
		tokenizer(stream, tokens, matchComment, null);
		return new TokenStream(tokens);
	}

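	/**
	 * Core scanning loop. Repeatedly skips whitespace and tries each token class in turn
	 * (comments, numbers, strings, regexps, language blocks, template strings, identifiers,
	 * lambdas, simple tokens). When {@code except} is non-null, scanning stops as soon as
	 * that literal is matched; this is used to terminate the ${...} expressions inside
	 * template strings.
	 */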
	private static List<Token> tokenizer(CharacterStream stream, List<Token> tokens, boolean matchComment, String except) {
		int leftCount = 0;
		int rightCount = 0;
		outer:
		while (stream.hasMore()) {
			stream.skipWhiteSpace();
			stream.startSpan();
			if (except != null && stream.match(except, true)) {
				return tokens;
			}
			// line comments (//) and block comments (/* */)
			if (tokenizerComment(stream, tokens, matchComment)) {
				continue;
			}
			// numeric literals: int, long, float, double, byte, short, decimal
			if (tokenizerNumber(stream, tokens)) {
				continue;
			}
			// '' """ """ ""
			if (tokenizerString(stream, TokenType.SingleQuote, tokens) || tokenizerString(stream, TokenType.TripleQuote, tokens) || tokenizerString(stream, TokenType.DoubleQuote, tokens)) {
				continue;
			}

			// regexp
			if (regexpToken(stream, tokens)) {
				continue;
			}
			// embedded language block: ```lang ... ```
			if (tokenizerLanguage(stream, tokens)) {
				continue;
			}
			// template string
			if (tokenizerTemplateString(stream, tokens, matchComment)) {
				continue;
			}
			// Identifier, keyword, boolean literal, or null literal
			if (tokenizerIdentifier(stream, tokens)) {
				continue;
			}
			// lambda
			if (stream.match("=>", true) || stream.match("->", true)) {
				tokens.add(new Token(TokenType.Lambda, stream.getSpan(stream.getPosition() - 2, stream.getPosition())));
				continue;
			}
			// Simple tokens
			for (TokenType t : TokenType.getSortedValues()) {
				if (t.getLiteral() != null) {
					if (stream.match(t.getLiteral(), true)) {
						if (t == TokenType.LeftCurly) {
							leftCount++;
						}
						tokens.add(new Token(t, stream.getSpan(stream.getPosition() - t.getLiteral().length(), stream.getPosition())));
						continue outer;
					}
				}
			}
			if (leftCount != rightCount && stream.match("}", true)) {
				rightCount++;
				tokens.add(new Token(TokenType.RightCurly, stream.getSpan(stream.getPosition() - 1, stream.getPosition())));
				continue outer;
			}
			if (stream.hasMore()) {
				MagicScriptError.error("Unknown token", stream.getSpan(stream.getPosition(), stream.getPosition() + 1));
			}
		}
		return tokens;
	}

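	/**
	 * Matches an embedded language block of the form ```lang ... ``` and emits two
	 * Language tokens: one for the language identifier and one for the block body.
	 */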
	private static boolean tokenizerLanguage(CharacterStream stream, List<Token> tokens) {
		// TODO exception
		if (stream.match("```", true)) {
			stream.startSpan();
			if (stream.matchIdentifierStart(true)) {
				while (stream.matchIdentifierPart(true)) {
					;
				}
				Span language = stream.endSpan();
				tokens.add(new Token(TokenType.Language, language));
				stream.startSpan();
				if (!stream.skipUntil("```")) {
					MagicScriptError.error("```需要以```结尾", stream.endSpan(), new StringLiteralException());
				}
				tokens.add(new Token(TokenType.Language, stream.endSpan(-3)));
				return true;
			} else {
				MagicScriptError.error("```后需要标识语言类型", stream.endSpan(), new StringLiteralException());
			}
		}
		return false;
	}

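	/**
	 * Matches a template string delimited by backticks. Plain text runs become nested
	 * StringLiteral tokens and each ${...} section is tokenized recursively; the whole
	 * template is emitted as a single StringLiteral token carrying the nested TokenStream.
	 */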
	private static boolean tokenizerTemplateString(CharacterStream stream, List<Token> tokens, boolean matchComment) {
		if (stream.match("`", true)) {
			int begin = stream.getPosition();
			int start = begin;
			boolean matchedEndQuote = false;
			List<Token> subTokens = new ArrayList<>();
			while (stream.hasMore()) {
				if (stream.match("\\", true)) {
					stream.consume();
					continue;
				}
				if (stream.match("`", true)) {
					matchedEndQuote = true;
					break;
				}
				if (stream.match("${", true)) {
					int end = stream.getPosition();
					if (start < end - 2) {
						subTokens.add(new LiteralToken(TokenType.StringLiteral, stream.endSpan(start, end - 2)));
					}
					subTokens.addAll(tokenizer(stream, new ArrayList<>(), matchComment, "}"));
					start = stream.getPosition();
					continue;
				}
				stream.consume();
			}
			if (!matchedEndQuote) {
				MagicScriptError.error("模板字符串没有结束符`", stream.endSpan(), new StringLiteralException());
			}
			Span stringSpan = stream.endSpan(begin, stream.getPosition());
			int end = stream.getPosition() - 1;
			if (end - start > 0) {
				subTokens.add(new LiteralToken(TokenType.StringLiteral, stream.endSpan(start, end)));
			}
			stringSpan = stream.getSpan(stringSpan.getStart() - 1, stringSpan.getEnd());
			tokens.add(new LiteralToken(TokenType.StringLiteral, stringSpan, new TokenStream(subTokens)));
			return true;
		}
		return false;
	}

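	/**
	 * Matches an identifier and reclassifies the reserved words true/false, null,
	 * instanceof and the SQL-style and/or keywords into their dedicated token types.
	 */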
	private static boolean tokenizerIdentifier(CharacterStream stream, List<Token> tokens) {
		if (stream.matchIdentifierStart(true)) {
			stream.startSpan();
			while (stream.matchIdentifierPart(true)) {
				;
			}
			Span identifierSpan = stream.endSpan();
			identifierSpan = stream.getSpan(identifierSpan.getStart() - 1, identifierSpan.getEnd());
			if ("true".equals(identifierSpan.getText()) || "false".equals(identifierSpan.getText())) {
				tokens.add(new LiteralToken(TokenType.BooleanLiteral, identifierSpan));
			} else if ("null".equals(identifierSpan.getText())) {
				tokens.add(new LiteralToken(TokenType.NullLiteral, identifierSpan));
			} else if ("instanceof".equals(identifierSpan.getText())) {
				tokens.add(new Token(TokenType.InstanceOf, identifierSpan));
			} else if (TokenType.SqlAnd.getLiteral().equalsIgnoreCase(identifierSpan.getText())) {
				tokens.add(new Token(TokenType.SqlAnd, identifierSpan));
			} else if (TokenType.SqlOr.getLiteral().equalsIgnoreCase(identifierSpan.getText())) {
				tokens.add(new Token(TokenType.SqlOr, identifierSpan));
			} else {
				tokens.add(new Token(TokenType.Identifier, identifierSpan));
			}
			return true;
		}
		return false;
	}

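	/**
	 * Matches single-line (//) and block comments. The comment text is skipped and a
	 * Comment token is emitted only when {@code matchComment} is true.
	 */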
	private static boolean tokenizerComment(CharacterStream stream, List<Token> tokens, boolean matchComment) {
		if (stream.match("//", true)) {    // single-line comment
			stream.skipLine();
			if (matchComment) {
				tokens.add(new Token(TokenType.Comment, stream.endSpan()));
			}
			return true;
		}
		stream.startSpan();
		if (stream.match("/*", true)) {    //多行注释
			stream.skipUntil("*/");
			if (matchComment) {
				tokens.add(new Token(TokenType.Comment, stream.endSpan()));
			}
			return true;
		}
		return false;
	}

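	/**
	 * Matches numeric literals: 0x/0X hexadecimal and 0b/0B binary forms (with an optional
	 * L/l suffix for long), plus decimal forms whose suffix selects byte (b/B), short (s/S),
	 * long (L/l), float (f/F), double (d/D) or decimal (m/M); a decimal point switches the
	 * default type from int to double.
	 */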
	private static boolean tokenizerNumber(CharacterStream stream, List<Token> tokens) {
		if (stream.match("0", false)) {
			int index = stream.getPosition();
			stream.startSpan();
			stream.consume();
			if (stream.matchAny(true, "x", "X")) {    // 0x 16进制
				while (stream.matchDigit(true) || stream.matchAny(true, "A", "B", "C", "D", "E", "F", "a", "b", "c", "d", "e", "f", "_")) {
					;
				}
				if (stream.matchAny(true, "L", "l")) {
					Span span = stream.endSpan();
					String text = span.getText();
					tokens.add(new LiteralToken(TokenType.LongLiteral, span, Long.parseLong(text.substring(2, text.length() - 1).replace("_",""), 16)));
					return true;
				}
				tokens.add(autoNumberType(stream.endSpan(), 16));
				return true;
			} else if (stream.matchAny(true, "b", "B")) {    //二进制
				while (stream.matchAny(true, "0", "1", "_")) {
					;
				}
				if (stream.matchAny(true, "L", "l")) {
					Span span = stream.endSpan();
					String text = span.getText();
					tokens.add(new LiteralToken(TokenType.LongLiteral, span, Long.parseLong(text.substring(2, text.length() - 1).replace("_",""), 2)));
					return true;
				}
				tokens.add(autoNumberType(stream.endSpan(), 2));
				return true;
			}
			stream.reset(index);
		}
		if (stream.matchDigit(false)) {
			TokenType type = TokenType.IntegerLiteral;
			stream.startSpan();
			while (stream.matchDigit(true) || stream.match("_", true)) {
				;
			}
			if (stream.match(TokenType.Period.getLiteral(), true)) {
				type = TokenType.DoubleLiteral;
				while (stream.matchDigit(true) || stream.match("_", true)) {
					;
				}
			}
			if (stream.matchAny(true, "b", "B")) {
				if (type == TokenType.DoubleLiteral) {
					MagicScriptError.error("Byte literal can not have a decimal point.", stream.endSpan());
				}
				type = TokenType.ByteLiteral;
			} else if (stream.matchAny(true, "s", "S")) {
				if (type == TokenType.DoubleLiteral) {
					MagicScriptError.error("Short literal can not have a decimal point.", stream.endSpan());
				}
				type = TokenType.ShortLiteral;
			} else if (stream.matchAny(true, "L", "l")) {
				if (type == TokenType.DoubleLiteral) {
					MagicScriptError.error("Long literal can not have a decimal point.", stream.endSpan());
				}
				type = TokenType.LongLiteral;
			} else if (stream.matchAny(true, "f", "F")) {
				type = TokenType.FloatLiteral;
			} else if (stream.matchAny(true, "d", "D")) {
				type = TokenType.DoubleLiteral;
			} else if (stream.matchAny(true, "m", "M")) {
				type = TokenType.DecimalLiteral;
			}
			Span numberSpan = stream.endSpan();
			tokens.add(new LiteralToken(type, numberSpan));
			return true;
		}
		return false;
	}

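	/**
	 * Chooses the narrowest literal type (byte, int or long) for a hexadecimal or binary
	 * literal based on its parsed value; a bare 0b / 0B prefix with no binary digits is
	 * treated as the byte value 0.
	 */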
	private static LiteralToken autoNumberType(Span span, int radix) {
		String text = span.getText();
		// handle a bare 0b / 0B (or -0b / -0B) prefix with no binary digits
		if (text.length() == 2 && radix == 2) {
			return new LiteralToken(TokenType.ByteLiteral, span, 0);
		}
		long value = Long.parseLong(text.substring(2).replace("_",""), radix);
		if (value > Integer.MAX_VALUE || value < Integer.MIN_VALUE) {
			return new LiteralToken(TokenType.LongLiteral, span, value);
		} else if (value > Byte.MAX_VALUE || value < Byte.MIN_VALUE) {
			return new LiteralToken(TokenType.IntegerLiteral, span, (int) value);
		} else {
			return new LiteralToken(TokenType.ByteLiteral, span, (byte) value);
		}
	}

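	/**
	 * Matches a string literal delimited by the given quote type (single, double or triple
	 * quotes). Backslash escapes are skipped here and resolved later; only triple-quoted
	 * strings may contain line breaks.
	 */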
	private static boolean tokenizerString(CharacterStream stream, TokenType tokenType, List<Token> tokens) {
		// String literal
		if (stream.match(tokenType.getLiteral(), true)) {
			stream.startSpan();
			boolean matchedEndQuote = false;
			while (stream.hasMore()) {
				// Note: escape sequences like \n are parsed in StringLiteral
				if (stream.match("\\", true)) {
					stream.consume();
					continue;
				}
				if (stream.match(tokenType.getLiteral(), true)) {
					matchedEndQuote = true;
					break;
				}
				char ch = stream.consume();
				if (tokenType != TokenType.TripleQuote && (ch == '\r' || ch == '\n')) {
					MagicScriptError.error(tokenType.getError() + tokenType.getError() + "-quoted strings cannot contain line breaks", stream.endSpan(), new StringLiteralException());
				}
			}
			if (!matchedEndQuote) {
				MagicScriptError.error("字符串没有结束符" + tokenType.getError(), stream.endSpan(), new StringLiteralException());
			}
			Span stringSpan = stream.endSpan();
			stringSpan = stream.getSpan(stringSpan.getStart(), stringSpan.getEnd() - tokenType.getLiteral().length());
			tokens.add(new LiteralToken(TokenType.StringLiteral, stringSpan));
			return true;
		}
		return false;
	}

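	/**
	 * Matches a /.../flags regular-expression literal. To disambiguate from the division
	 * operator, a regexp is only attempted when the previous token (if any) is one of the
	 * operator/punctuation types listed below. The i/m/s/u flags map to the corresponding
	 * java.util.regex.Pattern flags, while g and y are stored as the raw bit values 1 and 16.
	 */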
	private static boolean regexpToken(CharacterStream stream, List<Token> tokens) {
		if (tokens.size() > 0) {
			Token token = tokens.get(tokens.size() - 1);
			if (token instanceof LiteralToken) {
				return false;
			}
			switch (token.getType()){
				case Comma :			// ,
				case Semicolon :		// ;
				case Colon:				// :
				case RightCurly:		// }
				case LeftBracket:		// [
				case LeftParantheses:	// (
				case Assignment:		// =
				case NotEqual:			// !=
				case EqualEqualEqual:	// ===
				case NotEqualEqual:		// !==
				case Equal:				// ==
				case And:				// &&
				case Or:				// ||
				case SqlAnd:			// and
				case SqlOr:				// or
				case SqlNotEqual:		// <>
				case QuestionMark:		// ?
				case Lambda:			// => ->
				case Not:				// !
					break;
				default: return false;
			}
		}
		if (stream.match("/", false)) {
			int mark = stream.getPosition();
			stream.consume();
			stream.startSpan();
			boolean matchedEndQuote = false;
			int deep = 0;
			int expFlag = 0;
			int maybeMissForwardSlash = 0;
			int maybeMissForwardSlashEnd = 0;
			while (stream.hasMore()) {
				// Note: escape sequences like \n are parsed in StringLiteral
				if (stream.match("\\", true)) {
					stream.consume();
					continue;
				}
				if (stream.match("[", false)) {
					deep++;
					maybeMissForwardSlash = stream.getPosition();
				} else if (deep > 0 && stream.match("]", false)) {
					deep--;
				} else if (stream.match(TokenType.ForwardSlash.getLiteral(), true)) {
					if (deep == 0) {
						if (stream.match("g", true)) {
							expFlag |= 1;
						}
						if (stream.match("i", true)) {
							expFlag |= Pattern.CASE_INSENSITIVE;
						}
						if (stream.match("m", true)) {
							expFlag |= Pattern.MULTILINE;
						}
						if (stream.match("s", true)) {
							expFlag |= Pattern.DOTALL;
						}
						if (stream.match("u", true)) {
							expFlag |= Pattern.UNICODE_CHARACTER_CLASS;
						}
						if (stream.match("y", true)) {
							expFlag |= 16;
						}
						matchedEndQuote = true;
						break;
					} else {
						maybeMissForwardSlashEnd = stream.getPosition();
					}
				}
				char ch = stream.consume();
				if (ch == '\r' || ch == '\n') {
					stream.reset(mark);
					return false;
				}
			}
			if (deep != 0) {
				MagicScriptError.error("Missing ']'", stream.getSpan(maybeMissForwardSlash, maybeMissForwardSlashEnd - 1));
			}
			if (!matchedEndQuote) {
				stream.reset(mark);
				return false;
			}
			Span regexpSpan = stream.endSpan();
			regexpSpan = stream.getSpan(regexpSpan.getStart() - 1, regexpSpan.getEnd());
			tokens.add(new RegexpToken(TokenType.RegexpLiteral, regexpSpan, expFlag));
			return true;
		}
		return false;
	}
}