// com.sharksharding.sql.parser.Lexer Maven / Gradle / Ivy
// The newest version!
/*
* Copyright 1999-2101 Alibaba Group Holding Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.sharksharding.sql.parser;
import static com.sharksharding.sql.parser.CharTypes.isFirstIdentifierChar;
import static com.sharksharding.sql.parser.CharTypes.isIdentifierChar;
import static com.sharksharding.sql.parser.CharTypes.isWhitespace;
import static com.sharksharding.sql.parser.LayoutCharacters.EOI;
import static com.sharksharding.sql.parser.Token.COLONCOLON;
import static com.sharksharding.sql.parser.Token.COLONEQ;
import static com.sharksharding.sql.parser.Token.COMMA;
import static com.sharksharding.sql.parser.Token.EOF;
import static com.sharksharding.sql.parser.Token.ERROR;
import static com.sharksharding.sql.parser.Token.LBRACE;
import static com.sharksharding.sql.parser.Token.LBRACKET;
import static com.sharksharding.sql.parser.Token.LITERAL_ALIAS;
import static com.sharksharding.sql.parser.Token.LITERAL_CHARS;
import static com.sharksharding.sql.parser.Token.LPAREN;
import static com.sharksharding.sql.parser.Token.RBRACE;
import static com.sharksharding.sql.parser.Token.RBRACKET;
import static com.sharksharding.sql.parser.Token.RPAREN;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* @author wenshao
*/
public class Lexer {
// Input being scanned; assigned once in the constructor.
protected final String text;
// Index into text of the character currently held in ch.
protected int pos;
// Start offset of the current token's text; paired with bufPos by addSymbol()/numberString().
protected int mark;
// Character at index pos, or EOI once pos is past the end of text (see charAt).
protected char ch;
// Scratch buffer used when a literal's value cannot be taken as a plain substring of text.
protected char[] buf;
// Length of the current token's text, or the number of characters written to buf.
protected int bufPos;
// Most recently scanned token.
protected Token token;
// Keyword table consulted by scanIdentifier().
// NOTE(review): the name is misspelled ("keywods") but it is visible to subclasses and mirrored by
// getKeywods(), so renaming it would break callers.
protected Keywords keywods = Keywords.DEFAULT_KEYWORDS;
// Text recorded by the last identifier/string/alias/variable/comment scan.
protected String stringVal;
// Comments collected by addComment(); set to null by nextToken() and readAndResetComments().
protected List comments = new ArrayList(2);
// When true, nextToken() silently skips comment tokens instead of returning them.
protected boolean skipComment = true;
// Snapshot written by mark() and restored by reset(); null until mark() is first called.
private SavePoint savePoint = null;
/*
* Anti SQL injection switch: scanComment() throws NotAllowCommentException when this is false.
*/
private boolean allowComment = true;
// Counter behind nextVarIndex(); starts at -1 so that the first index handed out is 0.
private int varIndex = -1;
// Optional callback invoked with every scanned comment; may be null.
protected CommentHandler commentHandler;
// Exposed through isEndOfComment(); never written anywhere in this class.
protected boolean endOfComment = false;
// When true, scanned comment text is also stored in the comments list.
protected boolean keepComments = false;
// Number of line terminators counted so far by nextToken() and the single-line comment scanner.
protected int line = 0;
// Number of '\n' characters skipped as whitespace by the most recent nextToken() call.
protected int lines = 0;
/**
 * Creates a lexer over {@code input} without a comment handler.
 *
 * @param input the text to tokenize
 */
public Lexer(String input) {
    this(input, (CommentHandler) null);
}
/**
 * Creates a comment-skipping lexer over {@code input} and installs {@code commentHandler}.
 *
 * @param input          the text to tokenize
 * @param commentHandler callback invoked for every scanned comment; may be {@code null}
 */
public Lexer(String input, CommentHandler commentHandler) {
    // Delegates with skipComment = true; the handler is attached afterwards.
    this(input, true);
    this.commentHandler = commentHandler;
}
/**
 * @return {@code true} if scanned comments are stored in the comment list
 */
public boolean isKeepComments() {
    return this.keepComments;
}
/**
 * Enables or disables storing scanned comments in the comment list.
 *
 * @param keepComments {@code true} to keep comment text
 */
public void setKeepComments(boolean keepComments) {
    this.keepComments = keepComments;
}
/**
 * @return the installed comment callback, or {@code null} if none is set
 */
public CommentHandler getCommentHandler() {
    return this.commentHandler;
}
/**
 * Installs the callback that is invoked for every scanned comment.
 *
 * @param commentHandler the callback, or {@code null} to remove it
 */
public void setCommentHandler(CommentHandler commentHandler) {
    this.commentHandler = commentHandler;
}
/**
 * Returns the input character at {@code index}, or {@code EOI} when the index lies outside the input.
 *
 * <p>Negative indexes are treated like positions past the end. Callers in this class look behind
 * the current position (for example {@code charAt(pos - 2)} in {@code nextToken()}), which yields
 * {@code -1} when the input starts with a dot followed by a digit; {@link String#charAt(int)}
 * would throw for that index.
 *
 * @param index 0-based offset into the input
 * @return the character at {@code index}, or {@code EOI} if {@code index} is out of range
 */
public final char charAt(int index) {
    if (index < 0 || index >= text.length()) {
        return EOI;
    }
    return text.charAt(index);
}
/**
 * Returns the text of the current token: {@code bufPos} characters of the input starting at {@code mark}.
 *
 * @return the substring {@code [mark, mark + bufPos)} of the input
 */
public final String addSymbol() {
    return text.substring(mark, mark + bufPos);
}
/**
 * Returns {@code count} characters of the input starting at {@code offset}.
 *
 * @param offset 0-based start index
 * @param count  number of characters to return (a length, not an end index)
 * @return the substring {@code [offset, offset + count)} of the input
 */
public final String subString(int offset, int count) {
    final int end = offset + count;
    return text.substring(offset, end);
}
/**
 * Makes sure the scratch buffer exists and can hold at least {@code size} characters.
 *
 * <p>A missing buffer is allocated with 32 slots when {@code size} is below 32, otherwise with
 * {@code size + 32}. An existing buffer that is too small is grown to exactly {@code size},
 * keeping its contents; a buffer that is already large enough is left alone.
 *
 * @param size minimum required capacity
 */
protected void initBuff(int size) {
    if (buf != null) {
        if (buf.length < size) {
            buf = Arrays.copyOf(buf, size);
        }
        return;
    }
    buf = new char[size < 32 ? 32 : size + 32];
}
/**
 * Copies {@code length} characters of the input, starting at {@code srcPos}, into {@code dest}.
 *
 * @param srcPos  start index in the input
 * @param dest    destination array
 * @param destPos start index in {@code dest}
 * @param length  number of characters to copy
 */
public void arraycopy(int srcPos, char[] dest, int destPos, int length) {
    final int srcEnd = srcPos + length;
    text.getChars(srcPos, srcEnd, dest, destPos);
}
/**
 * @return {@code true} if comments are permitted in the input
 */
public boolean isAllowComment() {
    return this.allowComment;
}
/**
 * Permits or forbids comments in the input; when forbidden, {@code scanComment()} throws.
 *
 * @param allowComment {@code true} to permit comments
 */
public void setAllowComment(boolean allowComment) {
    this.allowComment = allowComment;
}
/**
 * Advances the variable counter and returns its new value; the first call returns 0.
 *
 * @return the next variable index
 */
public int nextVarIndex() {
    varIndex += 1;
    return varIndex;
}
/**
 * Snapshot of the scanner state taken by {@code mark()} and restored by {@code reset()}.
 */
private static class SavePoint {

    /** Saved value of {@code pos}. */
    int   bp;
    /** Saved value of {@code bufPos}. */
    int   sp;
    /** Saved value of {@code mark}. */
    int   np;
    /** Saved current character. */
    char  ch;
    /** Saved current token. */
    Token token;
}
/**
 * @return the keyword table used to classify identifiers
 */
public Keywords getKeywods() {
    return this.keywods;
}
/**
 * Remembers the current scanner state (position, token text bounds, current character and token)
 * so that {@code reset()} can return to it. A later call replaces the earlier snapshot.
 * {@code stringVal} is not part of the snapshot.
 */
public void mark() {
    final SavePoint snapshot = new SavePoint();
    snapshot.token = this.token;
    snapshot.ch = this.ch;
    snapshot.np = this.mark;
    snapshot.sp = this.bufPos;
    snapshot.bp = this.pos;
    this.savePoint = snapshot;
}
/**
 * Restores the scanner state captured by the last {@code mark()} call.
 *
 * @throws NullPointerException if {@code mark()} has never been called
 */
public void reset() {
    final SavePoint snapshot = this.savePoint;
    pos = snapshot.bp;
    bufPos = snapshot.sp;
    mark = snapshot.np;
    ch = snapshot.ch;
    token = snapshot.token;
}
/**
 * Creates a lexer over {@code input} and loads its first character.
 *
 * @param input       the text to tokenize
 * @param skipComment {@code true} to have {@code nextToken()} skip comments
 */
public Lexer(String input, boolean skipComment) {
    this.text = input;
    this.skipComment = skipComment;
    // Start one position before the input so that the first scanChar() lands on index 0.
    this.pos = -1;
    scanChar();
}
/**
 * Creates a lexer over the first {@code inputLength} characters of {@code input}.
 *
 * @param input       character array holding the text
 * @param inputLength number of leading characters to use
 * @param skipComment {@code true} to have {@code nextToken()} skip comments
 */
public Lexer(char[] input, int inputLength, boolean skipComment) {
    this(String.valueOf(input, 0, inputLength), skipComment);
}
/**
 * Advances one position and loads the character found there into {@code ch}.
 */
protected final void scanChar() {
    pos++;
    ch = charAt(pos);
}
/**
 * Steps back one position and reloads {@code ch} from there.
 */
protected void unscan() {
    pos--;
    ch = charAt(pos);
}
/**
 * @return {@code true} once the scan position has reached or passed the end of the input
 */
public boolean isEOF() {
    return text.length() <= pos;
}
/**
 * Flags a lexical error by making {@code ERROR} the current token.
 *
 * @param key  error key; not used by this implementation
 * @param args error arguments; not used by this implementation
 */
protected void lexError(String key, Object... args) {
    this.token = ERROR;
}
/**
 * @return the token produced by the most recent scan, e.g. by {@code nextToken()}
 */
public final Token token() {
    return this.token;
}
/**
 * @return the current token and the recorded string value, separated by a single space
 */
public String info() {
    return new StringBuilder()
            .append(this.token)
            .append(' ')
            .append(this.stringVal())
            .toString();
}
/**
 * Fast path for positions where a comma or a closing parenthesis is expected.
 *
 * <p>Skips at most one leading space, then recognises {@code ,} (ASCII or fullwidth U+FF0C) as
 * {@code COMMA} and {@code )} as {@code RPAREN}. Anything else is handed to {@code nextToken()}.
 *
 * <p>NOTE(review): the original condition compared against the ASCII comma twice, which looks
 * like a fullwidth comma lost to an encoding conversion; it is restored here as a unicode escape.
 */
public final void nextTokenComma() {
    if (ch == ' ') {
        scanChar();
    }

    if (ch == ',' || ch == '\uFF0C') {
        scanChar();
        token = COMMA;
        return;
    }

    if (ch == ')') {
        scanChar();
        token = RPAREN;
        return;
    }

    nextToken();
}
/**
 * Fast path for positions where an opening parenthesis is expected: skips at most one leading
 * space and recognises {@code (} as {@code LPAREN}; anything else is handed to {@code nextToken()}.
 */
public final void nextTokenLParen() {
    if (ch == ' ') {
        scanChar();
    }

    if (ch != '(') {
        nextToken();
        return;
    }

    scanChar();
    token = LPAREN;
}
/**
 * Fast path for positions where a value is expected.
 *
 * <p>Skips at most one leading space, then dispatches on the current character: a single quote
 * starts a string literal, a digit starts a number, {@code ?} yields {@code QUES}, and an
 * identifier start other than {@code N} starts an identifier. Everything else, including
 * {@code N} (which may introduce an {@code N'...'} literal), goes through {@code nextToken()}.
 */
public final void nextTokenValue() {
    if (ch == ' ') {
        scanChar();
    }

    if (ch == '\'') {
        bufPos = 0;
        scanString();
    } else if (isDigit(ch)) {
        bufPos = 0;
        scanNumber();
    } else if (ch == '?') {
        scanChar();
        token = Token.QUES;
    } else if (isFirstIdentifierChar(ch) && ch != 'N') {
        scanIdentifier();
    } else {
        nextToken();
    }
}
/**
 * Scans the next token and stores it in {@code token}.
 *
 * <p>Leading whitespace is skipped; every {@code '\n'} skipped increments {@code line}, and
 * {@code lines} ends up holding the number of newlines crossed by this call. Comments are skipped
 * as well when {@code skipComment} is set. Comments collected for the previous token are dropped
 * at the start of the call.
 *
 * <p>Changes relative to the previous revision:
 * <ul>
 * <li>the switch carried the ASCII comma label twice, which does not compile; the second label is
 * now the fullwidth comma U+FF0C (NOTE(review): assumed to be the character that was lost);</li>
 * <li>a dot at the very start of the input followed by a digit no longer looks behind the start
 * of the input;</li>
 * <li>the two identical branches after {@code ':'} were merged.</li>
 * </ul>
 *
 * @throws ParserException for a backquote, and from the delegated scan methods
 */
public final void nextToken() {
    bufPos = 0;
    comments = null;
    this.lines = 0;
    final int startLine = line;

    for (;;) {
        if (isWhitespace(ch)) {
            if (ch == '\n') {
                line++;
                lines = line - startLine;
            }
            scanChar();
            continue;
        }

        if (ch == '$' && charAt(pos + 1) == '{') {
            scanVariable();
            return;
        }

        if (isFirstIdentifierChar(ch)) {
            // N'...' is a national character string literal, not an identifier.
            if (ch == 'N' && charAt(pos + 1) == '\'') {
                ++pos;
                ch = '\'';
                scanString();
                token = Token.LITERAL_NCHARS;
                return;
            }
            scanIdentifier();
            return;
        }

        switch (ch) {
            case '0':
                if (charAt(pos + 1) == 'x') {
                    // Skip the "0x" prefix so that the hex scanner starts on the first digit.
                    scanChar();
                    scanChar();
                    scanHexaDecimal();
                } else {
                    scanNumber();
                }
                return;
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                scanNumber();
                return;
            case ',':
            case '\uFF0C':
                scanChar();
                token = COMMA;
                return;
            case '(':
                scanChar();
                token = LPAREN;
                return;
            case ')':
                scanChar();
                token = RPAREN;
                return;
            case '[':
                scanLBracket();
                return;
            case ']':
                scanChar();
                token = RBRACKET;
                return;
            case '{':
                scanChar();
                token = LBRACE;
                return;
            case '}':
                scanChar();
                token = RBRACE;
                return;
            case ':':
                scanChar();
                if (ch == '=') {
                    scanChar();
                    token = COLONEQ;
                } else if (ch == ':') {
                    scanChar();
                    token = COLONCOLON;
                } else {
                    // Not an operator: step back onto the colon and scan a variable.
                    unscan();
                    scanVariable();
                }
                return;
            case '#':
                scanSharp();
                // scanSharp() can be overridden; an override may produce a comment token here.
                if (isSkippableComment()) {
                    bufPos = 0;
                    continue;
                }
                return;
            case '.':
                scanChar();
                // A dot followed by a digit starts a number unless the character before the dot
                // (at pos - 2) can start an identifier. When the dot is the first character of
                // the input there is nothing before it.
                if (isDigit(ch) && (pos < 2 || !isFirstIdentifierChar(charAt(pos - 2)))) {
                    unscan();
                    scanNumber();
                    return;
                } else if (ch == '.') {
                    scanChar();
                    if (ch == '.') {
                        scanChar();
                        token = Token.DOTDOTDOT;
                    } else {
                        token = Token.DOTDOT;
                    }
                } else {
                    token = Token.DOT;
                }
                return;
            case '\'':
                scanString();
                return;
            case '\"':
                scanAlias();
                return;
            case '*':
                scanChar();
                token = Token.STAR;
                return;
            case '?':
                scanChar();
                token = Token.QUES;
                return;
            case ';':
                scanChar();
                token = Token.SEMI;
                return;
            case '`':
                throw new ParserException("TODO"); // TODO
            case '@':
                scanVariable();
                return;
            case '-':
                if (charAt(pos + 1) == '-') {
                    scanComment();
                    if (isSkippableComment()) {
                        bufPos = 0;
                        continue;
                    }
                } else {
                    scanOperator();
                }
                return;
            case '/': {
                final char following = charAt(pos + 1);
                if (following == '/' || following == '*') {
                    scanComment();
                    if (isSkippableComment()) {
                        bufPos = 0;
                        continue;
                    }
                } else {
                    token = Token.SLASH;
                    scanChar();
                }
                return;
            }
            default:
                if (Character.isLetter(ch)) {
                    scanIdentifier();
                    return;
                }

                if (isOperator(ch)) {
                    scanOperator();
                    return;
                }

                // QS_TODO ?
                if (isEOF()) { // JLS
                    token = EOF;
                } else {
                    lexError("illegal.char", String.valueOf((int) ch));
                    scanChar();
                }
                return;
        }
    }
}

/**
 * Tells whether the token just scanned is a comment that {@code nextToken()} should skip.
 */
private boolean isSkippableComment() {
    return skipComment && (token == Token.LINE_COMMENT || token == Token.MULTI_LINE_COMMENT);
}
/**
 * Consumes an opening square bracket and makes {@code LBRACKET} the current token.
 */
protected void scanLBracket() {
    token = LBRACKET;
    scanChar();
}
/**
 * Scans the operator that starts at the current character, using the longest match among the
 * spellings handled below, and stores the resulting token.
 *
 * @throws ParserException if the current character does not start a known operator
 */
private final void scanOperator() {
    switch (ch) {
        case '+':
            scanChar();
            token = Token.PLUS;
            break;
        case '-':
            scanChar();
            token = Token.SUB;
            break;
        case '*':
            scanChar();
            token = Token.STAR;
            break;
        case '/':
            scanChar();
            token = Token.SLASH;
            break;
        case '&':
            scanChar();
            token = advanceIf('&') ? Token.AMPAMP : Token.AMP;
            break;
        case '|':
            scanChar();
            if (advanceIf('|')) {
                token = advanceIf('/') ? Token.BARBARSLASH : Token.BARBAR;
            } else {
                token = advanceIf('/') ? Token.BARSLASH : Token.BAR;
            }
            break;
        case '^':
            scanChar();
            token = Token.CARET;
            break;
        case '%':
            scanChar();
            token = Token.PERCENT;
            break;
        case '=':
            scanChar();
            token = advanceIf('=') ? Token.EQEQ : Token.EQ;
            break;
        case '>':
            scanChar();
            if (advanceIf('=')) {
                token = Token.GTEQ;
            } else if (advanceIf('>')) {
                token = Token.GTGT;
            } else {
                token = Token.GT;
            }
            break;
        case '<':
            scanChar();
            if (advanceIf('=')) {
                token = advanceIf('>') ? Token.LTEQGT : Token.LTEQ;
            } else if (advanceIf('>')) {
                token = Token.LTGT;
            } else if (advanceIf('<')) {
                token = Token.LTLT;
            } else {
                token = Token.LT;
            }
            break;
        case '!':
            scanChar();
            if (advanceIf('=')) {
                token = Token.BANGEQ;
            } else if (advanceIf('>')) {
                token = Token.BANGGT;
            } else if (advanceIf('<')) {
                token = Token.BANGLT;
            } else if (advanceIf('!')) {
                token = Token.BANGBANG; // postsql
            } else {
                token = Token.BANG;
            }
            break;
        case '?':
            scanChar();
            token = Token.QUES;
            break;
        case '~':
            scanChar();
            token = Token.TILDE;
            break;
        default:
            throw new ParserException("TODO");
    }
}

/**
 * Consumes the current character if it equals {@code expected}.
 *
 * @return {@code true} if a character was consumed
 */
private boolean advanceIf(char expected) {
    if (ch != expected) {
        return false;
    }
    scanChar();
    return true;
}
/**
 * Scans a single-quoted string literal whose opening quote is the current character.
 *
 * <p>A doubled quote inside the literal stands for one quote character. While no doubled quote
 * has been seen, only the length is counted in {@code bufPos} and the value is later taken
 * directly from the input; from the first doubled quote on, the value is assembled in
 * {@code buf}. The method counts from the incoming {@code bufPos}; the callers in this class set
 * it to 0 beforehand.
 *
 * <p>On success the token is {@code LITERAL_CHARS} and {@code stringVal} holds the unquoted
 * value. If the input ends first, {@code lexError} is reported and {@code stringVal} is left
 * untouched.
 */
protected void scanString() {
    mark = pos;
    // Becomes true once the value has to be assembled in buf instead of sliced from the input.
    boolean buffered = false;

    while (true) {
        if (isEOF()) {
            lexError("unclosed.str.lit");
            return;
        }

        ch = charAt(++pos);

        if (ch == '\'') {
            scanChar();
            if (ch != '\'') {
                // A lone quote closes the literal.
                token = LITERAL_CHARS;
                break;
            }
            if (!buffered) {
                // First doubled quote: copy what has been counted so far into buf.
                initBuff(bufPos);
                arraycopy(mark + 1, buf, 0, bufPos);
                buffered = true;
            }
            putChar('\'');
            continue;
        }

        if (buffered) {
            putChar(ch);
        } else {
            bufPos++;
        }
    }

    stringVal = buffered ? new String(buf, 0, bufPos) : subString(mark + 1, bufPos);
}
/**
 * Scans a double-quoted alias whose opening quote is the current character.
 *
 * <p>A backslash immediately followed by a double quote contributes just the quote to the value.
 * Every accepted character is appended to {@code buf}; when no such escape occurred the value is
 * sliced from the input instead. The method appends from the incoming {@code bufPos}.
 *
 * <p>On success the token is {@code LITERAL_ALIAS} and {@code stringVal} holds the text between
 * the quotes. If the input ends first, {@code lexError} is reported.
 */
private final void scanAlias() {
    mark = pos;

    if (buf == null) {
        buf = new char[32];
    }

    boolean sawEscapedQuote = false;
    while (true) {
        if (isEOF()) {
            lexError("unclosed.str.lit");
            return;
        }

        ch = charAt(++pos);

        if (ch == '\"' && charAt(pos - 1) != '\\') {
            scanChar();
            token = LITERAL_ALIAS;
            break;
        }

        if (ch == '\\') {
            // Peek at the next character: keep it if it is a quote, otherwise step back.
            scanChar();
            if (ch == '"') {
                sawEscapedQuote = true;
            } else {
                unscan();
            }
        }

        putChar(ch);
    }

    stringVal = sawEscapedQuote ? new String(buf, 0, bufPos) : subString(mark + 1, bufPos);
}
/**
 * Handles a token that starts with {@code #}; this implementation scans it as a variable.
 */
public void scanSharp() {
    this.scanVariable();
}
/**
 * Scans a variable that starts at the current character, which must be one of {@code @},
 * {@code :}, {@code #} or {@code $}.
 *
 * <p>Two shapes are recognised: a prefix followed by a brace-delimited body, which runs up to and
 * including the closing brace, and a prefix (optionally followed by a second {@code @}) followed
 * by identifier characters. In both cases the token is {@code VARIANT} and {@code stringVal}
 * holds the complete text including the prefix.
 *
 * <p>The braced form previously looped until a closing brace was found and therefore never
 * terminated in a sane way on input without one; reaching the end of the input is now reported
 * as a syntax error.
 *
 * @throws ParserException if the current character is not a variable prefix, or a braced variable
 *                         is not closed before the end of the input
 */
public void scanVariable() {
    if (ch != '@' && ch != ':' && ch != '#' && ch != '$') {
        throw new ParserException("illegal variable");
    }

    mark = pos;
    bufPos = 1;

    if (charAt(pos + 1) == '{') {
        // Account for the opening brace.
        pos++;
        bufPos++;

        for (;;) {
            ++pos;
            if (pos >= text.length()) {
                throw new ParserException("syntax error");
            }
            if (text.charAt(pos) == '}') {
                break;
            }
            bufPos++;
        }

        // Account for the closing brace and move past it.
        ++pos;
        bufPos++;

        this.ch = charAt(pos);
        stringVal = addSymbol();
        token = Token.VARIANT;
        return;
    }

    if (charAt(pos + 1) == '@') {
        // Doubled prefix, e.g. "@@name".
        ++pos;
        bufPos++;
    }

    while (isIdentifierChar(charAt(++pos))) {
        bufPos++;
    }

    this.ch = charAt(pos);
    stringVal = addSymbol();
    token = Token.VARIANT;
}
/**
 * Scans the comment that starts at the current character: a line comment introduced by
 * {@code //} or {@code --}, or a block comment introduced by a slash followed by an asterisk.
 *
 * @throws NotAllowCommentException if comments are not allowed
 * @throws IllegalStateException    if the current position does not start a comment
 */
public void scanComment() {
    if (!allowComment) {
        throw new NotAllowCommentException();
    }

    final char following = charAt(pos + 1);
    final boolean lineComment = (ch == '/' && following == '/') || (ch == '-' && following == '-');

    if (lineComment) {
        scanSingleLineComment();
        return;
    }
    if (ch == '/' && following == '*') {
        scanMultiLineComment();
        return;
    }
    throw new IllegalStateException();
}
/**
 * Scans a block comment; the current character is the slash of the opening marker.
 *
 * <p>Afterwards the token is {@code MULTI_LINE_COMMENT}, {@code stringVal} holds the text between
 * the opening and closing markers, and the scan position is just past the closing marker. The
 * text is stored when {@code keepComments} is set and is offered to the comment handler, which
 * receives the token that preceded the comment. {@code line} is not updated here.
 *
 * @throws ParserException          if the input ends before the closing marker
 * @throws NotAllowCommentException if the handler did not accept the comment, comments are not
 *                                  allowed and the text is not considered safe
 */
private void scanMultiLineComment() {
    final Token precedingToken = this.token;

    // Skip the two characters of the opening marker.
    scanChar();
    scanChar();
    mark = pos;
    bufPos = 0;

    while (!(ch == '*' && charAt(pos + 1) == '/')) {
        // The comment terminator is missing.
        if (ch == EOI) {
            throw new ParserException("unterminated /* comment.");
        }
        scanChar();
        bufPos++;
    }

    // Skip the two characters of the closing marker.
    scanChar();
    scanChar();

    stringVal = subString(mark, bufPos);
    token = Token.MULTI_LINE_COMMENT;

    if (keepComments) {
        addComment(stringVal);
    }

    final boolean handled = commentHandler != null && commentHandler.handle(precedingToken, stringVal);
    if (handled) {
        return;
    }

    if (!isAllowComment() && !isSafeComment(stringVal)) {
        throw new NotAllowCommentException();
    }
}
/**
 * Scans a line comment; the current character is the first character of the two-character
 * marker ({@code //} or {@code --}).
 *
 * <p>Afterwards the token is {@code LINE_COMMENT} and {@code stringVal} holds the comment text
 * after the marker. The text used to start at the second marker character and to lose its last
 * character because {@code mark} was set one position too early; it now starts right after the
 * marker, like the block comment scanner does.
 *
 * <p>Line end handling is unchanged: {@code "\n"} ends the comment and is consumed; for
 * {@code "\r\n"} only the carriage return is consumed here; a carriage return that is not
 * followed by a line feed ends the comment, is counted as part of the text and is not consumed.
 * A comment that runs into the end of the input is rejected.
 *
 * @throws ParserException          if the input ends before a line terminator
 * @throws NotAllowCommentException if the handler did not accept the comment, comments are not
 *                                  allowed and the text is not considered safe
 */
private void scanSingleLineComment() {
    Token lastToken = this.token;

    // Skip the two characters of the marker; the comment text starts at the new position.
    scanChar();
    scanChar();
    mark = pos;
    bufPos = 0;

    for (;;) {
        if (ch == '\r') {
            if (charAt(pos + 1) == '\n') {
                line++;
                scanChar();
                break;
            }
            bufPos++;
            break;
        }

        if (ch == '\n') {
            line++;
            scanChar();
            break;
        }

        // The comment terminator is missing.
        if (ch == EOI) {
            throw new ParserException("syntax error at end of input.");
        }

        scanChar();
        bufPos++;
    }

    stringVal = subString(mark, bufPos);
    token = Token.LINE_COMMENT;

    if (keepComments) {
        addComment(stringVal);
    }

    if (commentHandler != null && commentHandler.handle(lastToken, stringVal)) {
        return;
    }

    if (!isAllowComment() && !isSafeComment(stringVal)) {
        throw new NotAllowCommentException();
    }
}
/**
 * Scans an identifier or keyword that starts at the current character.
 *
 * <p>Afterwards {@code stringVal} holds the scanned text, and the token is the matching keyword
 * if the keyword table knows the text, otherwise {@code IDENTIFIER}.
 *
 * @throws ParserException if the current character cannot start an identifier
 */
public void scanIdentifier() {
    if (!isFirstIdentifierChar(ch)) {
        throw new ParserException("illegal identifier");
    }

    mark = pos;
    bufPos = 1;

    // Extend over every following identifier character.
    while (isIdentifierChar(charAt(++pos))) {
        bufPos++;
    }

    this.ch = charAt(pos);
    stringVal = addSymbol();

    final Token keyword = keywods.getKeyword(stringVal);
    token = (keyword != null) ? keyword : Token.IDENTIFIER;
}
/**
 * Scans a numeric literal that starts at the current character: an optional minus sign, digits,
 * an optional fraction and an optional exponent.
 *
 * <p>The token is {@code LITERAL_FLOAT} when a fraction or an exponent is present, otherwise
 * {@code LITERAL_INT}. A dot that is directly followed by another dot is not treated as a
 * fraction; the integer scanned so far is returned and the dots are left in the input. The
 * literal's text is described by {@code mark} and {@code bufPos}, which is counted up from its
 * incoming value; {@code stringVal} is not set.
 */
public void scanNumber() {
    mark = pos;

    if (ch == '-') {
        bufPos++;
        ch = charAt(++pos);
    }

    consumeDigitRun();

    boolean floating = false;

    if (ch == '.') {
        if (charAt(pos + 1) == '.') {
            token = Token.LITERAL_INT;
            return;
        }
        bufPos++;
        ch = charAt(++pos);
        floating = true;

        consumeDigitRun();
    }

    if (ch == 'e' || ch == 'E') {
        bufPos++;
        ch = charAt(++pos);

        if (ch == '+' || ch == '-') {
            bufPos++;
            ch = charAt(++pos);
        }

        consumeDigitRun();
        floating = true;
    }

    token = floating ? Token.LITERAL_FLOAT : Token.LITERAL_INT;
}

/**
 * Consumes consecutive decimal digits, counting each of them in {@code bufPos}.
 */
private void consumeDigitRun() {
    while (ch >= '0' && ch <= '9') {
        bufPos++;
        ch = charAt(++pos);
    }
}
/**
 * Scans the digits of a hexadecimal literal, starting at the current character, and makes
 * {@code LITERAL_HEX} the current token. A leading minus sign is accepted and counted. The
 * digits are described by {@code mark} and {@code bufPos}.
 */
public void scanHexaDecimal() {
    mark = pos;

    if (ch == '-') {
        bufPos++;
        ch = charAt(++pos);
    }

    while (CharTypes.isHex(ch)) {
        bufPos++;
        ch = charAt(++pos);
    }

    token = Token.LITERAL_HEX;
}
/**
 * @return the text of the current token, {@code bufPos} characters starting at {@code mark}
 */
public String hexString() {
    return text.substring(mark, mark + bufPos);
}
/**
 * @param ch the character to test
 * @return {@code true} if {@code ch} is an ASCII decimal digit
 */
public final boolean isDigit(char ch) {
    return '0' <= ch && ch <= '9';
}
/**
 * Appends {@code ch} to the scratch buffer, doubling the buffer's capacity first when it is full.
 *
 * @param ch the character to append
 */
protected final void putChar(char ch) {
    if (bufPos == buf.length) {
        buf = Arrays.copyOf(buf, buf.length * 2);
    }
    buf[bufPos] = ch;
    bufPos++;
}
/**
 * @return the current scan position, a 0-based index into the input
 */
public final int pos() {
    return this.pos;
}
/**
 * @return the text recorded by the most recent identifier, string, alias, variable or comment
 *         scan; the number scanners do not update it
 */
public final String stringVal() {
    return this.stringVal;
}
/**
 * Hands over the collected comments and clears the lexer's reference to them.
 *
 * @return the comment list, or {@code null} if there is none
 */
public final List readAndResetComments() {
    final List collected = this.comments;
    this.comments = null;
    return collected;
}
/**
 * @param ch the character to test
 * @return {@code true} if {@code ch} is one of {@code ! % & * + - < = > ^ | ~ ;}
 */
private boolean isOperator(char ch) {
    return "!%&*+-<=>^|~;".indexOf(ch) >= 0;
}
// Overflow guards used by integerValue(), which accumulates the value as a negative number.
private static final long  MULTMIN_RADIX_TEN   = Long.MIN_VALUE / 10;
private static final long  N_MULTMAX_RADIX_TEN = -Long.MAX_VALUE / 10;

// Maps the characters '0'..'9' to their numeric value; the table ends at '9'.
private static final int[] digits              = new int['9' + 1];

static {
    for (char c = '0'; c <= '9'; c++) {
        digits[c] = c - '0';
    }
}
// QS_TODO negative number is invisible for lexer
/**
 * Converts the text of the current token, {@code bufPos} characters starting at {@code mark},
 * to a decimal integer.
 *
 * <p>The narrowest fitting type is returned: an {@code Integer} if the value fits into an
 * {@code int}, otherwise a {@code Long}, otherwise a {@code BigInteger} built from
 * {@code numberString()}. A leading minus sign is honoured.
 *
 * <p>Characters are looked up in the {@code digits} table, which only covers code points up to
 * {@code '9'}; a character above that range raises {@code ArrayIndexOutOfBoundsException}.
 *
 * @return the numeric value of the current token
 * @throws NumberFormatException if the token consists of a minus sign only
 */
public Number integerValue() {
long result = 0;
boolean negative = false;
int i = mark, max = mark + bufPos;
// Most negative value the accumulator may reach without overflowing.
long limit;
// Threshold below which multiplying the accumulator by 10 would overflow.
long multmin;
int digit;
if (charAt(mark) == '-') {
negative = true;
limit = Long.MIN_VALUE;
i++;
} else {
limit = -Long.MAX_VALUE;
}
multmin = negative ? MULTMIN_RADIX_TEN : N_MULTMAX_RADIX_TEN;
// The first digit needs no overflow check.
if (i < max) {
digit = digits[charAt(i++)];
result = -digit;
}
while (i < max) {
// Accumulating negatively avoids surprises near MAX_VALUE
digit = digits[charAt(i++)];
// Too large for a long: fall back to arbitrary precision.
if (result < multmin) {
return new BigInteger(numberString());
}
result *= 10;
if (result < limit + digit) {
return new BigInteger(numberString());
}
result -= digit;
}
if (negative) {
if (i > mark + 1) {
// result already carries the sign; narrow it to an Integer when it fits.
if (result >= Integer.MIN_VALUE) {
return (int) result;
}
return result;
} else { /* Only got "-" */
throw new NumberFormatException(numberString());
}
} else {
// Flip the negatively accumulated value back to positive.
result = -result;
if (result <= Integer.MAX_VALUE) {
return (int) result;
}
return result;
}
}
/**
 * @return the current scan position, a 0-based index into the input
 */
public int bp() {
    return pos;
}
/**
 * @return the character at the current scan position
 */
public char current() {
    return ch;
}
/**
 * Repositions the lexer at an explicitly supplied state.
 *
 * @param mark     scan position to return to; it is stored in {@code pos}, the {@code mark}
 *                 field itself is left unchanged
 * @param markChar character to install as the current character
 * @param token    token to install as the current token
 */
public void reset(int mark, char markChar, Token token) {
    this.token = token;
    this.ch = markChar;
    this.pos = mark;
}
/**
 * @return the text of the current token, {@code bufPos} characters starting at {@code mark}
 */
public final String numberString() {
    return text.substring(mark, mark + bufPos);
}
/**
 * Converts the text of the current token to a {@code BigDecimal}.
 *
 * @return the decimal value of the current token
 * @throws NumberFormatException if the text is not a valid decimal number
 */
public BigDecimal decimalValue() {
    final String literal = numberString();
    return new BigDecimal(literal);
}
public static interface CommentHandler {
boolean handle(Token lastToken, String comment);
}
/**
 * Tells whether at least one comment has been collected and not yet handed over.
 *
 * <p>The comment list starts out as an empty list rather than {@code null}, so checking for
 * {@code null} alone reported comments on a lexer that had not scanned anything yet.
 *
 * @return {@code true} if the comment list exists and is not empty
 */
public boolean hasComment() {
    return comments != null && !comments.isEmpty();
}
/**
 * Jumps to the end of the input and makes {@code EOF} the current token.
 *
 * <p>The current character is refreshed as well. It was previously left at its old value, so
 * a later {@code nextToken()} call would scan a character that no longer matched the position.
 */
public void skipToEOF() {
    pos = text.length();
    // charAt() yields EOI for a position at the end of the input.
    ch = charAt(pos);
    this.token = Token.EOF;
}
/**
 * @return the value of the {@code endOfComment} flag
 */
public boolean isEndOfComment() {
    return this.endOfComment;
}
/**
 * Heuristic anti-injection check for comment text.
 *
 * <p>A comment is considered unsafe if, ignoring case, it contains one of the words
 * {@code select, delete, insert, update, into, where, or, and, union} anywhere (plain substring
 * match), or one of the characters {@code ' = > < & | ^}.
 *
 * <p>Lower-casing uses {@code Locale.ROOT}. With the default locale the result depended on the
 * platform: under a Turkish locale an upper-case {@code I} maps to a dotless i, so words such as
 * {@code INSERT}, {@code INTO} or {@code UNION} were not recognised.
 *
 * @param comment the comment text; may be {@code null}
 * @return {@code true} if {@code comment} is {@code null} or contains none of the listed
 *         words and characters
 */
protected boolean isSafeComment(String comment) {
    if (comment == null) {
        return true;
    }

    final String lowered = comment.toLowerCase(java.util.Locale.ROOT);

    final String[] unsafeWords = {
        "select", "delete", "insert", "update", "into", "where", "or", "and", "union"
    };
    for (String word : unsafeWords) {
        if (lowered.indexOf(word) != -1) {
            return false;
        }
    }

    final String unsafeChars = "'=><&|^";
    for (int i = 0; i < unsafeChars.length(); i++) {
        if (lowered.indexOf(unsafeChars.charAt(i)) != -1) {
            return false;
        }
    }

    return true;
}
/**
 * Appends {@code comment} to the comment list, creating the list when necessary.
 *
 * <p>The {@code stringVal} field used to be stored instead of the argument. The callers in this
 * class pass {@code stringVal}, so they are unaffected; any other caller now gets the text it
 * actually passed.
 *
 * @param comment the comment text to store
 */
protected void addComment(String comment) {
    if (comments == null) {
        comments = new ArrayList(2);
    }
    comments.add(comment);
}
/**
 * @return the number of line terminators counted so far
 */
public int getLine() {
    return this.line;
}
}