
// org.bidib.wizard.highlight.JavaScanner Maven / Gradle / Ivy
package org.bidib.wizard.highlight;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
// Public domain, no restrictions, Ian Holyer, University of Bristol.
/**
*
* Provide a hand-written scanner for the Java language.
*/
public class JavaScanner extends Scanner {
private static final Logger LOGGER = LoggerFactory.getLogger(JavaScanner.class);
// The version of Java supported.
private int version = 15;
private final boolean debug = false;
/** Create a Java scanner, for Java version 1.5 by default. */
public JavaScanner() {
super();
initKind();
initUniKind();
}
/** Create a Java scanner, for a given version between "1.1" and "1.5". */
public JavaScanner(String version) {
super();
initKind();
initUniKind();
if (version.equals("1.1")) {
this.version = 11;
}
else if (version.equals("1.2")) {
this.version = 12;
}
else if (version.equals("1.3")) {
this.version = 13;
}
else if (version.equals("1.4")) {
this.version = 14;
}
else if (version.equals("1.5")) {
this.version = 15;
}
else {
throw new Error("Unknown version of Java: " + version);
}
}
/**
 * Scans one token beginning at {@code start} and advances {@code start} past it.
 * Overrides the read method of the Scanner class.
 * <p>
 * When {@code state} is MID_COMMENT or END_COMMENT the scanner continues an open
 * block comment. Otherwise the first character (or unicode escape) is classified
 * through the KIND / UNIKIND tables and the matching reader is invoked.
 *
 * @return the token type; WHITESPACE when there is nothing left to scan, and a
 *         negated type (see {@code bad}) for a malformed or incomplete token
 */
@Override
protected int read() {
    int type;
    // Always remember where the token begins so the debug trace reports its real extent.
    final int tokenStart = start;
    if (start >= end) {
        return WHITESPACE;
    }
    switch (state) {
        case MID_COMMENT:
        case END_COMMENT:
            // Still inside a block comment that was opened on an earlier line.
            type = readComment(MID_COMMENT);
            if (type == END_COMMENT) {
                state = WHITESPACE;
            }
            else {
                state = MID_COMMENT;
            }
            return type;
        default:
            char c = buffer[start];
            if (c == '\\') {
                c = next();
            }
            type = kindOf(c);
            switch (type) {
                case WHITESPACE:
                    start = start + charlength;
                    charlength = 1;
                    // Swallow the whole run of whitespace as a single token.
                    while (start < end) {
                        c = buffer[start];
                        if (c == '\\') {
                            c = next();
                        }
                        if (kindOf(c) != WHITESPACE) {
                            break;
                        }
                        start = start + charlength;
                        charlength = 1;
                    }
                    break;
                case UNRECOGNIZED:
                case BRACKET:
                case SEPARATOR:
                    // Single-character tokens.
                    start = start + charlength;
                    charlength = 1;
                    break;
                case OPERATOR:
                    start = start + charlength;
                    charlength = 1;
                    type = readOperator(c);
                    break;
                case CHARACTER:
                    start = start + charlength;
                    charlength = 1;
                    type = readCharLiteral();
                    break;
                case STRING:
                    start = start + charlength;
                    charlength = 1;
                    type = readStringLiteral();
                    break;
                case IDENTIFIER:
                    start = start + charlength;
                    charlength = 1;
                    // Identifier parts are characters classified as IDENTIFIER or NUMBER.
                    while (start < end) {
                        c = buffer[start];
                        if (c == '\\') {
                            c = next();
                        }
                        final int k = kindOf(c);
                        if (k != IDENTIFIER && k != NUMBER) {
                            break;
                        }
                        start = start + charlength;
                        charlength = 1;
                    }
                    break;
                case NUMBER:
                    start = start + charlength;
                    charlength = 1;
                    type = readNumber(c);
                    break;
                case PUNCTUATION:
                    start = start + charlength;
                    charlength = 1;
                    type = readDot();
                    break;
                case COMMENT:
                    start = start + charlength;
                    charlength = 1;
                    type = readSlash();
                    if (type == START_COMMENT) {
                        // Block comment left open: keep scanning it on the next call.
                        state = MID_COMMENT;
                    }
                    break;
                default:
                    break;
            }
    }
    if (LOGGER.isDebugEnabled()) {
        // Malformed tokens carry a negated type, which must not be used as an array index.
        final Object typeName = TokenTypes.TYPENAMES[Math.abs(type)];
        LOGGER.debug("{}{} {},{}({})", type < 0 ? "bad " : "", typeName, tokenStart, end, start - tokenStart);
    }
    return type;
}

/** Classifies a character: ASCII through KIND, everything else by unicode category through UNIKIND. */
private static int kindOf(char c) {
    if (c < 128) {
        return KIND[c];
    }
    return UNIKIND[Character.getType(c)];
}
/**
 * Extends an operator whose first character {@code c} has already been consumed
 * to its longest form (for example {@code ++}, {@code +=}, {@code >>>=}).
 *
 * @param c the operator character that was just consumed
 * @return always OPERATOR
 */
private int readOperator(char c) {
    if (start >= end) {
        return OPERATOR;
    }
    final boolean doubling = c == '+' || c == '-' || c == '&' || c == '|';
    final boolean assignOnly = c == '=' || c == '*' || c == '!' || c == '^' || c == '%' || c == '/';
    final boolean angle = c == '<' || c == '>';
    if (!doubling && !assignOnly && !angle) {
        // '~', '?', ':' and anything else never combine with a following character.
        return OPERATOR;
    }

    char second = buffer[start];
    if (second == '\\') {
        second = next();
    }

    if (doubling) {
        // x followed by the same x, or by '='
        if (second == c || second == '=') {
            start = start + charlength;
            charlength = 1;
        }
        return OPERATOR;
    }
    if (assignOnly) {
        // x followed by '='
        if (second == '=') {
            start = start + charlength;
            charlength = 1;
        }
        return OPERATOR;
    }

    // '<' or '>': comparison, shift, or shift-assignment
    if (second == '=') {
        start = start + charlength;
        charlength = 1;
        return OPERATOR;
    }
    if (second != c) {
        return OPERATOR;
    }
    start = start + charlength;
    charlength = 1;
    if (start >= end) {
        return OPERATOR;
    }
    char third = buffer[start];
    if (third == '\\') {
        third = next();
    }
    if (third == '=') {
        start = start + charlength;
        charlength = 1;
        return OPERATOR;
    }
    if (c != '>' || third != '>') {
        return OPERATOR;
    }
    // >>>
    start = start + charlength;
    charlength = 1;
    if (start >= end) {
        return OPERATOR;
    }
    char fourth = buffer[start];
    if (fourth == '\\') {
        fourth = next();
    }
    if (fourth == '=') {
        start = start + charlength;
        charlength = 1;
    }
    return OPERATOR;
}
/**
 * Reads the remainder of a character literal; the opening quote has already
 * been consumed.
 *
 * @return CHARACTER for a well-formed literal, otherwise the negated type
 */
private int readCharLiteral() {
    if (start >= end) {
        return bad(CHARACTER);
    }
    char content = buffer[start];
    if (content == '\\') {
        content = next();
    }
    // An immediately closing quote or a line break is rejected without being consumed.
    if (content == '\'' || content == '\n') {
        return bad(CHARACTER);
    }
    start = start + charlength;
    charlength = 1;
    // A backslash introduces an escape sequence that must itself be valid.
    if (content == '\\' && !readEscapeSequence()) {
        return bad(CHARACTER);
    }

    if (start >= end) {
        return bad(CHARACTER);
    }
    char closing = buffer[start];
    if (closing == '\\') {
        closing = next();
    }
    if (closing != '\'') {
        return bad(CHARACTER);
    }
    start = start + charlength;
    charlength = 1;
    return CHARACTER;
}
/**
 * Reads the remainder of a string literal; the opening quote has already been
 * consumed.
 * <p>
 * The end of the buffered text is checked before every look-ahead, including
 * the one that follows an escape sequence, so the scanner never reads
 * {@code buffer[start]} with {@code start >= end}.
 *
 * @return STRING when the closing quote is found; the negated type when the
 *         text ends, a line break is met, or an escape sequence is invalid
 */
private int readStringLiteral() {
    while (true) {
        if (start >= end) {
            // Unterminated literal.
            return bad(STRING);
        }
        char c = buffer[start];
        if (c == '\\') {
            c = next();
        }
        switch (c) {
            case '"':
                // Closing quote: consume it and finish.
                start = start + charlength;
                charlength = 1;
                return STRING;
            case '\n':
                // A literal may not span lines; the line break is not consumed.
                return bad(STRING);
            case '\\':
                start = start + charlength;
                charlength = 1;
                if (!readEscapeSequence()) {
                    return bad(STRING);
                }
                break;
            default:
                start = start + charlength;
                charlength = 1;
                break;
        }
    }
}
/**
 * Decides what a just-consumed '/' begins: a line comment, a block comment,
 * or a division operator.
 *
 * @return COMMENT for a line comment (including its terminating line break, if
 *         present), the result of readComment(START_COMMENT) for a block
 *         comment, otherwise the result of readOperator('/')
 */
private int readSlash() {
    if (start >= end) {
        return OPERATOR;
    }
    char following = buffer[start];
    if (following == '\\') {
        following = next();
    }
    if (following == '*') {
        start = start + charlength;
        charlength = 1;
        return readComment(START_COMMENT);
    }
    if (following != '/') {
        return readOperator('/');
    }
    // Line comment: consume up to the line break or the end of the text.
    do {
        start = start + charlength;
        charlength = 1;
        if (start >= end) {
            return COMMENT;
        }
        following = buffer[start];
        if (following == '\\') {
            following = next();
        }
    } while (following != '\n');
    // The line break belongs to the comment token.
    start = start + charlength;
    charlength = 1;
    return COMMENT;
}
// Reads at most one line of a block comment.
// The 'type' argument is the token type to report while the comment is still
// open: it is returned when the text or the line ends first. When the closing
// star-slash is found, COMMENT is returned if 'type' is START_COMMENT (the
// comment opened and closed on the same line), otherwise END_COMMENT.
int readComment(int type) {
    if (start >= end) {
        return type;
    }
    char ch = buffer[start];
    if (ch == '\\') {
        ch = next();
    }
    for (;;) {
        // Skip ahead to the next star or line break.
        while (ch != '*' && ch != '\n') {
            start = start + charlength;
            charlength = 1;
            if (start >= end) {
                return type;
            }
            ch = buffer[start];
            if (ch == '\\') {
                ch = next();
            }
        }
        final boolean lineEnded = ch == '\n';
        // Consume the star or the line break.
        start = start + charlength;
        charlength = 1;
        if (lineEnded || start >= end) {
            return type;
        }
        ch = buffer[start];
        if (ch == '\\') {
            ch = next();
        }
        if (ch == '/') {
            start = start + charlength;
            charlength = 1;
            return type == START_COMMENT ? COMMENT : END_COMMENT;
        }
        // Not a terminator; ch is re-examined, so a run of stars is handled.
    }
}
/**
 * Reads the remainder of a numeric literal, without checking whether the value
 * is out of range.
 * <p>
 * Contract: the character {@code c} has ALREADY been consumed by the caller
 * ({@code read()} passes the leading digit, {@code readDot()} passes '.'), so
 * {@code start} points at the character that follows {@code c}.
 * <p>
 * Literals with a leading zero go through the decimal path, which also covers
 * forms such as 0777.9 or 07779f; only the hexadecimal prefix is special-cased.
 *
 * @param c the first character of the literal: a digit, or '.' when the
 *          literal starts with its fraction
 * @return NUMBER, or the negated type when the literal is malformed
 */
private int readNumber(char c) {
    // A leading '.' was consumed by the caller; a '.' met later is still in the buffer.
    final boolean leadingDot = c == '.';

    if (c == '0') {
        if (start >= end) {
            return NUMBER;
        }
        char c2 = buffer[start];
        if (c2 == '\\') {
            c2 = next();
        }
        if (c2 == 'x' || c2 == 'X') {
            start = start + charlength;
            charlength = 1;
            if (!readDigits(16)) {
                // "0x" with no hexadecimal digit
                return bad(NUMBER);
            }
            readSuffix();
            return NUMBER;
        }
        // Anything else (digits, '.', exponent, suffix) is handled below.
    }

    boolean hasDigits = false;
    if ('0' <= c && c <= '9') {
        hasDigits = true;
        // Remaining integer digits; there may be none.
        readDigits(10);
        if (start >= end) {
            return NUMBER;
        }
        c = buffer[start];
        if (c == '\\') {
            c = next();
        }
        if (c == 'l' || c == 'L') {
            start = start + charlength;
            charlength = 1;
            return NUMBER;
        }
    }

    if (c == '.') {
        if (!leadingDot) {
            // Consume the decimal point that was only looked at so far.
            start = start + charlength;
            charlength = 1;
        }
        if (start >= end) {
            return NUMBER;
        }
        c = buffer[start];
        if (c == '\\') {
            c = next();
        }
        if ('0' <= c && c <= '9') {
            hasDigits = true;
            readDigits(10);
            if (start >= end) {
                return NUMBER;
            }
            c = buffer[start];
            if (c == '\\') {
                c = next();
            }
        }
    }

    if (!hasDigits) {
        return bad(NUMBER);
    }

    switch (c) {
        case 'e':
        case 'E':
            start = start + charlength;
            charlength = 1;
            if (start >= end) {
                return bad(NUMBER);
            }
            c = buffer[start];
            if (c == '\\') {
                c = next();
            }
            if (c == '+' || c == '-') {
                start = start + charlength;
                charlength = 1;
                if (start >= end) {
                    return bad(NUMBER);
                }
            }
            if (!readDigits(10)) {
                // An exponent needs at least one digit.
                return bad(NUMBER);
            }
            // Optional float/double suffix after the exponent, e.g. 1e5f.
            if (start < end) {
                c = buffer[start];
                if (c == '\\') {
                    c = next();
                }
                if (c == 'f' || c == 'F' || c == 'd' || c == 'D') {
                    start = start + charlength;
                    charlength = 1;
                }
            }
            return NUMBER;
        case 'f':
        case 'F':
        case 'd':
        case 'D':
            start = start + charlength;
            charlength = 1;
            return NUMBER;
        default:
            return NUMBER;
    }
}
/**
 * Consumes a run of digits valid in the given radix, starting at {@code start}.
 *
 * @param radix the radix passed to {@link Character#digit(char, int)}
 * @return true if at least one digit was consumed
 */
boolean readDigits(int radix) {
    boolean consumedAny = false;
    while (start < end) {
        char ch = buffer[start];
        if (ch == '\\') {
            ch = next();
        }
        if (Character.digit(ch, radix) < 0) {
            break;
        }
        consumedAny = true;
        start = start + charlength;
        charlength = 1;
    }
    return consumedAny;
}
/**
 * Consumes a single numeric type suffix (f, F, d, D, l or L) if one is next;
 * otherwise leaves the position untouched.
 */
void readSuffix() {
    if (start >= end) {
        return;
    }
    char ch = buffer[start];
    if (ch == '\\') {
        ch = next();
    }
    final boolean floating = ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D';
    final boolean integral = ch == 'l' || ch == 'L';
    if (floating || integral) {
        start = start + charlength;
        charlength = 1;
    }
}
/**
 * Handles what follows a just-consumed '.': a fraction such as .5 is handed to
 * readNumber('.'); for version 15 and above two further raw '.' characters are
 * consumed as part of the same token; otherwise the dot stands alone.
 *
 * @return the result of readNumber('.') when a digit follows, otherwise SEPARATOR
 */
private int readDot() {
    if (start >= end) {
        return SEPARATOR;
    }
    char following = buffer[start];
    if (following == '\\') {
        following = next();
    }
    if (Character.isDigit(following)) {
        return readNumber('.');
    }
    // Short-circuit order keeps buffer[start + 1] from being read past the end.
    final boolean ellipsis = start + 1 < end && version >= 15 && following == '.' && buffer[start + 1] == '.';
    if (ellipsis) {
        start = start + 2;
    }
    return SEPARATOR;
}
/**
 * Reads the part of an escape sequence that follows the backslash, which has
 * already been consumed.
 *
 * @return true for one of b, t, n, f, r, double quote, single quote, backslash,
 *         or for an octal escape accepted by readOctal; false otherwise
 */
private boolean readEscapeSequence() {
    if (start >= end) {
        return false;
    }
    char code = buffer[start];
    if (code == '\\') {
        code = next();
    }
    // Single-character escapes.
    if ("btnfr\"'\\".indexOf(code) >= 0) {
        start = start + charlength;
        charlength = 1;
        return true;
    }
    // Octal escapes: a leading 0-3 allows up to three digits, 4-7 only two.
    if (code >= '0' && code <= '3') {
        return readOctal(3);
    }
    if (code >= '4' && code <= '7') {
        return readOctal(2);
    }
    return false;
}
/**
 * Consumes up to {@code maxlength} octal digits starting at {@code start}.
 * <p>
 * Every consumed digit is counted, including one that happens to be the last
 * buffered character.
 *
 * @param maxlength the maximum number of digits to consume
 * @return true if at least one digit was consumed and the value does not
 *         exceed 0xFF
 */
boolean readOctal(int maxlength) {
    int digits = 0;
    int val = 0;
    while (digits < maxlength && start < end) {
        char c = buffer[start];
        if (c == '\\') {
            c = next();
        }
        final int d = Character.digit(c, 8);
        if (d == -1) {
            break;
        }
        val = 8 * val + d;
        start = start + charlength;
        charlength = 1;
        digits++;
    }
    return digits > 0 && val <= 0xFF;
}
// A malformed or incomplete token has a negative type: this helper returns the
// negation of the given token type so callers can flag the token as bad while
// still carrying the underlying type in its magnitude.
private int bad(int type) {
return -type;
}
// Look ahead at the next character or unicode escape, without consuming it.
// For efficiency, callers replace c = next(); with
// c = buffer[start]; if (c == '\\') c = next();
// To accept the character after looking at it, use:
// start = start + charlength; charlength = 1;
//
// charlength records the number of source characters the looked-at character
// occupies (more than 1 for a unicode escape). To deal with an odd or even
// number of backslashes preceding a unicode escape, whenever a second backslash
// is coming up, its position is remembered in 'pair' so that it is returned as
// a plain backslash instead of being taken as the start of an escape.
private int charlength = 1;

// Position of the second backslash of a pair, or -1 when none is pending.
// -1 is used because 0 is a valid buffer position.
private int pair = -1;

/**
 * Returns the character at {@code start}, decoding a unicode escape if one
 * begins there, and sets {@code charlength} to the number of source characters
 * it spans.
 *
 * @return the character; 26 (EOF) when {@code start >= end}; a plain backslash
 *         when the backslash does not start a unicode escape; '\0' when the
 *         escape is truncated or contains a non-hexadecimal digit
 */
private char next() {
    if (start >= end) {
        return 26; // EOF
    }
    char c = buffer[start];
    if (c != '\\') {
        return c;
    }
    if (start == pair) {
        // Second backslash of a pair: never the start of a unicode escape.
        pair = -1;
        return '\\';
    }
    if (start + 1 >= end) {
        return '\\';
    }
    c = buffer[start + 1];
    if (c == '\\') {
        // Remember the partner so the next look-ahead treats it as a plain backslash.
        pair = start + 1;
    }
    if (c != 'u') {
        return '\\';
    }
    // Any number of 'u' characters may follow the backslash.
    int pos = start + 2;
    while (pos < end && buffer[pos] == 'u') {
        pos++;
    }
    if (pos + 4 > end) {
        // Truncated escape: it spans the rest of the text.
        charlength = end - start;
        return '\0';
    }
    c = 0;
    for (int j = 0; j < 4; j++) {
        int d = Character.digit(buffer[pos + j], 16);
        if (d < 0) {
            // Malformed escape: it spans everything up to the offending character.
            charlength = pos + j - start;
            return '\0';
        }
        c = (char) (c * 16 + d);
    }
    charlength = pos + 4 - start;
    return c;
}
/**
 * Registers the reserved words, a few well-known class names and the literal
 * words through lookup(). Overrides initSymbolTable of the Scanner class.
 * <p>
 * "assert", "enum" and "strictfp" are registered only when the configured
 * version is at least 14, 15 and 12 respectively.
 */
@Override
protected void initSymbolTable() {
    // Listed in registration order.
    final String[] reservedWords = {
        "abstract", "assert", "boolean", "break", "byte", "case", "catch", "char", "class", "const",
        "continue", "default", "do", "double", "else", "enum", "extends", "final", "finally", "float",
        "for", "goto", "if", "implements", "import", "instanceof", "int", "interface", "long", "native",
        "new", "package", "private", "protected", "public", "return", "short", "static", "strictfp",
        "super", "switch", "synchronized", "this", "throw", "throws", "transient", "try", "void",
        "volatile", "while"
    };
    for (String word : reservedWords) {
        final boolean enabled;
        if (word.equals("assert")) {
            enabled = version >= 14;
        }
        else if (word.equals("enum")) {
            enabled = version >= 15;
        }
        else if (word.equals("strictfp")) {
            enabled = version >= 12;
        }
        else {
            // Every other word is registered regardless of the version.
            enabled = true;
        }
        if (enabled) {
            lookup(KEYWORD, word);
        }
    }

    final String[] wellKnownClasses = {
        "String", "Integer", "Long", "Short", "Boolean", "Character", "Double", "Float"
    };
    for (String className : wellKnownClasses) {
        lookup(KEYWORD2, className);
    }

    final String[] literalWords = { "true", "false", "null" };
    for (String literal : literalWords) {
        lookup(LITERAL, literal);
    }
}
// *** Override lookup, but what about unicode escape translation?
// Reusable probe key for querying the symbol table.
private final Symbol temp = new Symbol(0, null);

/**
 * Resolves a name to a symbol. For an IDENTIFIER the symbol table is first
 * probed for the same name registered as KEYWORD, then KEYWORD2, then LITERAL;
 * the first hit is returned. Everything else is delegated to the superclass.
 *
 * @param type the token type
 * @param name the token text
 * @return the matching reserved symbol, or the result of super.lookup(type, name)
 */
@Override
protected Symbol lookup(int type, String name) {
    if (type == IDENTIFIER) {
        temp.name = name;
        temp.type = KEYWORD;
        Symbol reserved = symbolTable.get(temp);
        if (reserved == null) {
            temp.type = KEYWORD2;
            reserved = symbolTable.get(temp);
        }
        if (reserved == null) {
            temp.type = LITERAL;
            reserved = symbolTable.get(temp);
        }
        if (reserved != null) {
            return reserved;
        }
    }
    return super.lookup(type, name);
}
// Classify the ascii characters using an array of kinds, and classify all
// other unicode characters using an array indexed by unicode category.
// See the source file java/lang/Character.java for the categories.
// To find the classification of a character, use:
// if (c < 128) k = kind[c]; else k = unikind[Character.getType(c)];
private static final byte[] KIND = new byte[128];
private static final byte[] UNIKIND = new byte[31];
// The two classification arrays are filled by initKind() and initUniKind(),
// which the constructors call. Token types from the TokenTypes class are used
// to classify characters.

/**
 * Fills KIND with the token type that each ASCII character (0..127) starts.
 * Entries are first reset to -1; any entry still -1 afterwards is reported at
 * debug level.
 */
private void initKind() {
    for (int i = 0; i < 128; i++) {
        KIND[i] = -1;
    }
    for (char c = 0; c < 128; c++) {
        if (c == '\t' || c == '\n' || c == ' ' || c == '\f' || c == 26) {
            KIND[c] = WHITESPACE;
        }
        else if (c < ' ' || c == 127 || c == '#' || c == '@' || c == '`' || c == '\\') {
            // Remaining control characters, DEL, and symbols with no meaning here.
            KIND[c] = UNRECOGNIZED;
        }
        else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '$' || c == '_') {
            KIND[c] = IDENTIFIER;
        }
        else if (c >= '0' && c <= '9') {
            KIND[c] = NUMBER;
        }
        else if ("!%&*+-:<=>?^|~".indexOf(c) >= 0) {
            KIND[c] = OPERATOR;
        }
        else if ("()[]{}".indexOf(c) >= 0) {
            KIND[c] = BRACKET;
        }
        else if (c == ',' || c == ';') {
            KIND[c] = SEPARATOR;
        }
        else if (c == '"') {
            KIND[c] = STRING;
        }
        else if (c == '\'') {
            KIND[c] = CHARACTER;
        }
        else if (c == '.') {
            KIND[c] = PUNCTUATION;
        }
        else if (c == '/') {
            KIND[c] = COMMENT;
        }
    }
    for (char c = 0; c < 128; c++) {
        if (KIND[c] == -1) {
            LOGGER.debug("Char " + ((int) c) + " hasn't been classified");
        }
    }
}
/**
 * Fills UNIKIND, indexed by the category returned from Character.getType(),
 * with the token type that a non-ASCII character of that category starts.
 * Entries are first reset to -1; any entry still -1 afterwards is reported at
 * debug level.
 */
private void initUniKind() {
    for (int i = 0; i < 31; i++) {
        UNIKIND[i] = -1;
    }

    final byte[] unrecognizedCategories = {
        Character.UNASSIGNED, Character.ENCLOSING_MARK, Character.OTHER_NUMBER,
        Character.SPACE_SEPARATOR, Character.LINE_SEPARATOR, Character.PARAGRAPH_SEPARATOR,
        Character.CONTROL,
        17, // category 17 is unused
        Character.PRIVATE_USE, Character.SURROGATE, Character.DASH_PUNCTUATION,
        Character.START_PUNCTUATION, Character.END_PUNCTUATION, Character.OTHER_PUNCTUATION,
        Character.MATH_SYMBOL, Character.MODIFIER_SYMBOL, Character.OTHER_SYMBOL,
        Character.INITIAL_QUOTE_PUNCTUATION, Character.FINAL_QUOTE_PUNCTUATION
    };
    for (byte category : unrecognizedCategories) {
        UNIKIND[category] = UNRECOGNIZED;
    }

    // Categories that may start an identifier.
    final byte[] identifierCategories = {
        Character.UPPERCASE_LETTER, Character.LOWERCASE_LETTER, Character.TITLECASE_LETTER,
        Character.MODIFIER_LETTER, Character.OTHER_LETTER, Character.LETTER_NUMBER,
        Character.CONNECTOR_PUNCTUATION, // maybe NUMBER
        Character.CURRENCY_SYMBOL
    };
    for (byte category : identifierCategories) {
        UNIKIND[category] = IDENTIFIER;
    }

    // Categories classified as NUMBER; read() accepts NUMBER characters inside identifiers.
    final byte[] numberCategories = {
        Character.NON_SPACING_MARK, Character.COMBINING_SPACING_MARK,
        Character.DECIMAL_DIGIT_NUMBER, Character.FORMAT
    };
    for (byte category : numberCategories) {
        UNIKIND[category] = NUMBER;
    }

    for (byte b = 0; b < 31; b++) {
        if (UNIKIND[b] == -1) {
            LOGGER.debug("Unicode cat " + b + " hasn't been classified");
        }
    }
}
}