org.netbeans.lib.xml.lexer.DTDLexer Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.netbeans.lib.xml.lexer;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.netbeans.api.lexer.PartType;
import org.netbeans.api.lexer.Token;
import org.netbeans.api.xml.lexer.DTDTokenId;
import org.netbeans.spi.lexer.Lexer;
import org.netbeans.spi.lexer.LexerInput;
import org.netbeans.spi.lexer.LexerRestartInfo;
import org.netbeans.spi.lexer.TokenFactory;
/**
* New simple implementation of DTD lexer. Unlike XML lexer, this one does not attempt
* to report erroneous token based on context; so keyword is a keyword (almost) on all places. Syntax
* analyzer is needed to report premature ends or invalid combinations / sequences of tokens.
*
* @author sdedic
*/
public class DTDLexer implements Lexer {
/**
 * Default state outside declarations or instructions.
 */
private static final int ISI_INIT = 0;
/**
 * Plaintext outside declarations.
 */
private static final int ISI_TEXT = 1;
/**
 * Inside a {@code &lt;?} processing instruction.
 */
private static final int ISI_PROCESSING_INSTR = 2;
/**
 * DTD declaration: {@code &lt;!} was seen, the declaration keyword follows.
 */
private static final int ISI_DECLARATION = 3;
/**
 * State entered after the ENTITY declaration keyword.
 */
private static final int ISI_ENTITY = 4;
/**
 * State entered after the ELEMENT declaration keyword.
 */
private static final int ISI_ELEMENT = 5;
/**
 * State entered after the ATTLIST declaration keyword.
 */
private static final int ISI_ATTLIST = 6;
/**
 * State entered after the NOTATION declaration keyword.
 */
private static final int ISI_NOTATION = 7;
/**
 * Inside a comment.
 */
private static final int ISI_COMMENT = 8;
/**
 * Processing instruction; after the initial symbol and before the target.
 */
private static final int SUB_PROCESSING_TARGET = 0;
/**
 * Processing instruction; after the target - content is being processed.
 */
private static final int SUB_PROCESSING_CONTENT = 1;
/**
 * XML declaration is being processed; in between attributes, after
 * attribute name and before value.
 */
private static final int SUB_PROCESSING_XML = 2;
/**
 * Declaration body; the declared name is expected next.
 */
private static final int SUB_DECLARATION_NAME = 0;
/**
 * Declaration body; the name was processed, the definition follows.
 */
private static final int SUB_DECLARATION_DEF = 1;
/**
 * A single-quoted string value was interrupted by a reference and must be resumed.
 */
private static final int SUB_VALUE_QUOTE = 5;
/**
 * A double-quoted string value was interrupted by a reference and must be resumed.
 */
private static final int SUB_VALUE_DOUBLE = 6;
/**
 * Character source; assigned once in the constructor.
 */
private final LexerInput input;
/**
 * Main lexer state, one of the ISI_* constants.
 */
private int state;
/**
 * Refinement of {@link #state}, one of the SUB_* constants.
 */
private int substate;
/**
 * Packed state/substate (see saveState()) remembered while a reference embedded
 * in a comment or string value is being lexed; 0 when nothing is interrupted.
 */
private int intrSubstate;
/**
 * Factory used to create all tokens; assigned once in the constructor.
 */
private final TokenFactory tokenFactory;
/**
 * Handles a '&lt;' that was read while a processing instruction or a declaration
 * was still open. The opening characters are reported as an ERROR token and the
 * lexer state is switched to whatever construct the new markup starts.
 *
 * @return ERROR token covering the unexpected opening sequence
 */
private Token unterminatedFoundOpen() {
    int ch = input.read();
    if (ch == '?') {
        setState(ISI_PROCESSING_INSTR, SUB_PROCESSING_TARGET);
        return error();
    }
    if (ch == '!') {
        ch = input.read();
        if (Character.isAlphabetic(ch)) {
            // directive; give the first letter back so that nextDeclaration()
            // reads the complete keyword, the same way nextTokenInit() does
            input.backup(1);
            setState(ISI_DECLARATION);
            return error();
        }
        if (ch == '-') { // NOI18N
            // potential comment
            ch = input.read();
            if (ch == '-') { // NOI18N
                setState(ISI_COMMENT);
                return error();
            }
        }
    }
    // not a recognized opening; un-read the last character and stay in the current state
    input.backup(1);
    return error();
}
/**
 * Lexes one token in the initial state (outside of any declaration, comment or
 * processing instruction).
 * <p>
 * Recognizes the start of a processing instruction, of a declaration and of a
 * comment, as well as entity / character / parameter-entity references. Anything
 * else is handed over to {@link #nextTokenContent()}.
 *
 * @return the next token, or {@code null} when the end of input was reached
 */
public Token nextTokenInit() {
    int ch = input.read();
    switch (ch) {
        case '<': { // NOI18N
            ch = input.read();
            if (ch == '?') { // NOI18N
                // processing instruction; symbol
                setState(ISI_PROCESSING_INSTR, SUB_PROCESSING_TARGET);
                return tokenFactory.createToken(DTDTokenId.SYMBOL);
            }
            if (ch == '!') { // NOI18N
                ch = input.read();
                if (Character.isAlphabetic(ch)) {
                    // directive; symbol. The letter belongs to the keyword token.
                    setState(ISI_DECLARATION);
                    input.backup(1);
                    return tokenFactory.createToken(DTDTokenId.SYMBOL);
                } else if (ch == '-') { // NOI18N
                    // potential comment
                    ch = input.read();
                    if (ch == '-') { // NOI18N
                        return skipComment();
                    }
                }
                input.backup(1);
                return tokenFactory.createToken(DTDTokenId.ERROR);
            }
            // A lone '<': un-read the following character and lex the '<' as
            // ordinary content. Previously control fell through into the '&'
            // branch and the text was scanned as an entity reference.
            input.backup(1);
            break;
        }
        case '&': {
            Token reference = processEntityOrCharacterRef();
            return reference == null ? error() : reference;
        }
        case '%': {
            Token reference = processParsedEntity();
            return reference == null ? error() : reference;
        }
        default:
            break;
    }
    return nextTokenContent();
}
/**
 * Scans the digits of a character reference; the leading "&amp;#" was already read.
 * An 'x' directly after the '#' switches to hexadecimal digits.
 *
 * @return CHARACTER token when at least one digit was read before ';' or the end
 * of input, ERROR token when no digit was read before ';', or {@code null} when an
 * unexpected character is encountered
 */
private Token processCharacterReference() {
    int c = input.read();
    final boolean hexadecimal = (c == 'x');
    if (hexadecimal) {
        c = input.read();
    }
    boolean sawDigit = false;
    while (c != ';') {
        boolean decimalDigit = c >= '0' && c <= '9';
        boolean hexLetter = (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
        if (!decimalDigit && !(hexadecimal && hexLetter)) {
            return null;
        }
        sawDigit = true;
        c = input.read();
        if (c == LexerInput.EOF) {
            break;
        }
    }
    return createReferenceToken(sawDigit ? DTDTokenId.CHARACTER : DTDTokenId.ERROR);
}
/**
 * Creates a token for everything read so far. While an interrupted construct is
 * pending (intrSubstate is non-zero) the token is created as a MIDDLE part.
 *
 * @param id id of the token to create
 * @return the created token
 */
private Token createReferenceToken(DTDTokenId id) {
    if (intrSubstate == 0) {
        return tokenFactory.createToken(id);
    }
    return tokenFactory.createToken(id, input.readLength(), PartType.MIDDLE);
}
/**
 * Scans an entity or character reference; the introducing character was already read.
 * A name must start with a letter and may continue with letters, digits, '-' and '.'.
 *
 * @return REFERENCE or CHARACTER token for a well-formed reference, ERROR token otherwise
 */
private Token processEntityOrCharacterRef() {
    boolean atStart = true;
    for (int c = input.read(); c != LexerInput.EOF; c = input.read()) {
        if (c == ';') {
            if (atStart) {
                return error();
            }
            return createReferenceToken(DTDTokenId.REFERENCE);
        }
        if (atStart && c == '#') {
            int mark = input.readLength();
            Token charRef = processCharacterReference();
            if (charRef == null) {
                // give back everything consumed after the '#'
                input.backup(input.readLength() - mark);
                return error();
            }
            return charRef;
        }
        boolean nameChar = atStart
                ? Character.isLetter(c)
                : (c == '-' || c == '.' || Character.isLetterOrDigit(c));
        if (!nameChar) {
            // not an entity reference; un-read the offending character
            input.backup(1);
            break;
        }
        atStart = false;
    }
    // invalid character or end of document inside a potential reference
    return error();
}
/**
 * Lexes (a part of) a comment. Called either right after the comment opening was
 * read, or at the start of a token while the lexer is in the ISI_COMMENT state.
 * <p>
 * The comment is split at '&amp;': the text before it is returned as a COMMENT
 * token and the reference itself is lexed by processEntityOrCharacterRef().
 *
 * @return COMMENT, ERROR or reference token; {@code null} when the end of input
 * is hit and no character was read for the current token
 */
private Token skipComment() {
    int ch;
    boolean minus = false;
    setState(ISI_COMMENT);
    while ((ch = input.read()) != LexerInput.EOF) {
        if (ch == '-') {
            if (!minus) {
                minus = true;
                continue;
            }
            // two consecutive '-' were read
            ch = input.read();
            if (ch == '>') { // NOI18N
                setState(ISI_INIT);
                return tokenFactory.createToken(DTDTokenId.COMMENT);
            }
            intrSubstate = 0;
            if (input.readLength() > 2) {
                // "--" not followed by '>': end the comment token and leave comment state
                setState(ISI_INIT);
                input.backup(2);
                return tokenFactory.createToken(DTDTokenId.COMMENT);
            }
            return tokenFactory.createToken(DTDTokenId.ERROR);
        }
        minus = false;
        if (ch == '&') {
            if (input.readLength() > 1) {
                // emit the comment text collected before the reference
                input.backup(1);
                return tokenFactory.createToken(DTDTokenId.COMMENT);
            }
            intrSubstate = saveState();
            return processEntityOrCharacterRef();
        }
    }
    intrSubstate = 0;
    setState(ISI_INIT);
    if (input.readLength() > 0) {
        // report entire thing as a comment
        input.backup(1);
        return tokenFactory.createToken(DTDTokenId.COMMENT);
    }
    // Nothing was read before the end of input: signal EOF with null instead of
    // creating a zero-length ERROR token.
    return null;
}
/**
 * Switches to the given state; substate and the interrupted-state marker are reset to 0.
 *
 * @param state the new main state
 */
private void setState(int state) {
    this.state = state;
    this.substate = 0;
    this.intrSubstate = 0;
}
/**
 * Switches to the given state and substate and clears the interrupted-state marker.
 *
 * @param state the new main state
 * @param subState the new substate
 */
private void setState(int state, int subState) {
    intrSubstate = 0;
    substate = subState;
    this.state = state;
}
/**
 * Lexes regular content and stops in front of the first markup character
 * ('&lt;', '&amp;' or '%'). A reference located directly at the token start is
 * lexed as a reference; a '%' that does not start a valid parameter-entity
 * reference is treated as ordinary content.
 *
 * @return PLAIN or reference token, or {@code null} if nothing was read before
 * the end of input
 */
private Token nextTokenContent() {
    for (int c = input.read(); c != LexerInput.EOF; c = input.read()) {
        boolean markup = c == '<' || c == '&' || c == '%';
        if (!markup) {
            continue;
        }
        if (c == '<' || input.readLength() > 1) {
            // retract the markup character and emit the text collected so far
            input.backup(1);
            setState(ISI_INIT);
            return tokenFactory.createToken(DTDTokenId.PLAIN);
        }
        if (c == '&') {
            return processEntityOrCharacterRef();
        }
        // '%' at the token start: parsed entity
        Token parsed = processParsedEntity();
        if (parsed != null) {
            setState(ISI_INIT);
            return parsed;
        }
        // not a reference, keep collecting content
    }
    setState(ISI_INIT);
    return input.readLength() == 0 ? null : tokenFactory.createToken(DTDTokenId.PLAIN);
}
/**
 * Scans a parameter-entity reference; the '%' was already read. Characters that
 * were consumed are not returned to the input on failure.
 *
 * @return REFERENCE token when a name terminated by ';' was read, {@code null} otherwise
 */
private Token processParsedEntity() {
    boolean atStart = true;
    for (int c = input.read(); c != LexerInput.EOF; c = input.read()) {
        if (c == ';') {
            return atStart ? null : tokenFactory.createToken(DTDTokenId.REFERENCE);
        }
        boolean nameChar = atStart
                ? Character.isLetter(c)
                : (c == '-' || c == '.' || Character.isLetterOrDigit(c));
        if (!nameChar) {
            // not an entity reference, fall back to normal text processing
            return null;
        }
        atStart = false;
    }
    return null;
}
/**
 * Produces the next token by dispatching on the current lexer state.
 * <p>
 * ENTITY and NOTATION declarations are lexed by the same routine as ELEMENT and
 * ATTLIST declarations. Without that routing nothing ever left those states and
 * the whole rest of the input was reported as one-character ERROR tokens.
 *
 * @return the next token, or {@code null} at the end of input
 */
@Override
public Token nextToken() {
    switch (state) {
        case ISI_INIT:
            return nextTokenInit();
        case ISI_COMMENT:
            return skipComment();
        case ISI_PROCESSING_INSTR:
            return nextProcessingInstr();
        case ISI_DECLARATION:
            return nextDeclaration();
        case ISI_ELEMENT:
        case ISI_ATTLIST:
        case ISI_ENTITY:
        case ISI_NOTATION:
            return processElementOrAttlist();
        default:
            break;
    }
    // unknown state: report each character as an error
    int ch = input.read();
    return ch == LexerInput.EOF ? null : error();
}
/**
 * Lexes one token inside the body of a declaration. Resumes an interrupted string
 * value first, then reports whitespace, the closing '&gt;', references, and finally
 * delegates to the name or definition scanner depending on the substate.
 *
 * @return the next token, or {@code null} at the end of input
 */
private Token processElementOrAttlist() {
    if (substate == SUB_VALUE_DOUBLE) {
        restoreState();
        return stringvalue('"');
    }
    if (substate == SUB_VALUE_QUOTE) {
        restoreState();
        return stringvalue('\'');
    }
    Token whitespace = skipWhitespace();
    if (whitespace != null) {
        return whitespace;
    }
    int c = input.read();
    if (c == LexerInput.EOF) {
        return null;
    }
    if (c == '>') {
        // terminate definition
        setState(ISI_INIT);
        return tokenFactory.createToken(DTDTokenId.SYMBOL);
    }
    if (c == '&' || c == '%') {
        return processEntityOrCharacterRef();
    }
    if (substate == SUB_DECLARATION_NAME) {
        input.backup(1);
        return processDeclarationName();
    }
    if (substate == SUB_DECLARATION_DEF) {
        input.backup(1);
        return processDeclarationDef();
    }
    return error();
}
/**
 * Recognized declaration keywords, mapped to the lexer state that is entered
 * after the keyword. A declaration (&lt;!) followed by other name
 * will be reported as ERROR token.
 */
private static final Map<String, Integer> DECLARATION_KEYWORDS = new HashMap<>();
static {
    DECLARATION_KEYWORDS.put("ELEMENT", ISI_ELEMENT);
    DECLARATION_KEYWORDS.put("ATTLIST", ISI_ATTLIST);
    DECLARATION_KEYWORDS.put("ENTITY", ISI_ENTITY);
    DECLARATION_KEYWORDS.put("NOTATION", ISI_NOTATION);
}
/**
 * Lexes the keyword that follows the declaration opening and switches to the
 * state registered for it in DECLARATION_KEYWORDS.
 *
 * @return DECLARATION token for a known keyword; SYMBOL for a closing '&gt;';
 * ERROR for anything else; {@code null} at the end of input
 */
private Token nextDeclaration() {
    int ch = input.read();
    if (ch == LexerInput.EOF) {
        // nothing left to tokenize; an ERROR token here would have zero length
        return null;
    }
    if (ch == '>') {
        // end of declaration
        setState(ISI_INIT);
        return tokenFactory.createToken(DTDTokenId.SYMBOL);
    }
    if (ch == '<') {
        return unterminatedFoundOpen();
    }
    if (!Character.isLetter(ch)) {
        return error();
    }
    // consume the rest of the keyword; at least one letter was already read
    while ((ch = input.read()) != LexerInput.EOF) {
        if (!Character.isLetterOrDigit(ch)) {
            input.backup(1);
            break;
        }
    }
    String name = input.readText().toString();
    Integer nState = DECLARATION_KEYWORDS.get(name);
    if (nState == null) {
        // unknown keyword
        setState(ISI_INIT);
        return error();
    }
    setState(nState);
    return tokenFactory.createToken(DTDTokenId.DECLARATION);
}
/**
 * Lexes the name that follows a declaration keyword. Every path that consumes a
 * name, a reference or an invalid first character switches the substate to
 * SUB_DECLARATION_DEF, so the following tokens are lexed as the definition.
 * <p>
 * NOTE(review): processElementOrAttlist() handles a leading '%' or '&amp;' itself
 * before delegating here, so the branches for those characters at the first
 * position look unreachable from that caller - confirm.
 *
 * @return NAME, OPERATOR, reference or ERROR token
 */
private Token processDeclarationName() {
int ch;
boolean first = true;
while ((ch = input.read()) != LexerInput.EOF) {
// '%' as the very first character of an ENTITY declaration body
if (first && (ch == '%') && (state == ISI_ENTITY)) {
int n = input.read();
if (Character.isWhitespace(n)) {
// ENTITY % whatever. Emit OPERATOR
// NOTE(review): the whitespace character just read is not backed up,
// so it becomes part of the OPERATOR token - confirm this is intended.
return tokenFactory.createToken(DTDTokenId.OPERATOR);
}
// not followed by whitespace: un-read the lookahead character
input.backup(1);
}
if (ch == '%' || ch == '&') {
Token t = processEntityOrCharacterRef();
substate = SUB_DECLARATION_DEF;
if (t != null) {
return t;
} else {
return error();
}
}
if (!isNametokenChar(ch, first)) {
if (first) {
// the name cannot start with this character; report it alone as an error
substate = SUB_DECLARATION_DEF;
return error();
}
// end of the name; leave the terminating character for the next token
input.backup(1);
break;
}
first = false;
}
substate = SUB_DECLARATION_DEF;
return tokenFactory.createToken(DTDTokenId.NAME);
}
/**
 * Keywords possible after &lt;!ELEMENT.
 */
private static final Set<String> ELEMENT_KEYWORDS = new HashSet<>();
static {
    ELEMENT_KEYWORDS.add("#PCDATA");
    ELEMENT_KEYWORDS.add("EMPTY");
    ELEMENT_KEYWORDS.add("ANY");
}
/**
 * Keywords possible after &lt;!ATTLIST.
 */
private static final Set<String> ATTLIST_KEYWORDS = new HashSet<>();
/**
 * Keywords possible after &lt;!NOTATION; the same set is consulted for
 * &lt;!ENTITY declarations.
 */
private static final Set<String> NOTATION_KEYWORDS = new HashSet<>();
static {
    ATTLIST_KEYWORDS.add("#PCDATA");
    ATTLIST_KEYWORDS.add("CDATA");
    ATTLIST_KEYWORDS.add("ID");
    ATTLIST_KEYWORDS.add("IDREF");
    ATTLIST_KEYWORDS.add("IDREFS");
    ATTLIST_KEYWORDS.add("ENTITY");
    ATTLIST_KEYWORDS.add("NMTOKEN");
    ATTLIST_KEYWORDS.add("NMTOKENS");
    ATTLIST_KEYWORDS.add("NOTATION");
    ATTLIST_KEYWORDS.add("#REQUIRED");
    ATTLIST_KEYWORDS.add("#IMPLIED");
    ATTLIST_KEYWORDS.add("#FIXED");
    NOTATION_KEYWORDS.add("SYSTEM");
    NOTATION_KEYWORDS.add("PUBLIC");
    NOTATION_KEYWORDS.add("NDATA");
}
/**
 * Lexes one token of a declaration's definition part: whitespace, content-model
 * operators, references, quoted string values, and names / keywords.
 * <p>
 * Names are scanned with the same character rules as declared names (see
 * isNametokenChar), so names containing '-', '_', '.' or ':' such as
 * {@code front-matter} or {@code xml:lang} form a single NAME token instead of
 * being broken up by ERROR tokens. A word starting with '#' or a name is reported
 * as KEYWORD when it is listed in the keyword set of the current declaration.
 *
 * @return the next token; ERROR for an unexpected character
 */
private Token processDeclarationDef() {
    Token wh = skipWhitespace();
    if (wh != null) {
        return wh;
    }
    int ch = input.read();
    switch (ch) {
        case '(': case ')': case '|': case ',': // NOI18N
        case '+': // NOI18N
        case '*': // NOI18N
        case '?': // NOI18N
            return tokenFactory.createToken(DTDTokenId.OPERATOR);
        case '%': case '&': // NOI18N
            return processEntityOrCharacterRef();
        case '"': case '\'': // NOI18N
            return stringvalue(ch);
        default:
            break;
    }
    if (ch != '#' && !isNametokenChar(ch, true)) { // NOI18N
        return error();
    }
    // consume the remaining name characters
    do {
        ch = input.read();
    } while (ch != LexerInput.EOF && isNametokenChar(ch, false));
    // un-read the terminating character (or the EOF marker)
    input.backup(1);
    String text = input.readText().toString();
    Set<?> keywords;
    switch (state) {
        case ISI_ATTLIST:
            keywords = ATTLIST_KEYWORDS;
            break;
        case ISI_ELEMENT:
            keywords = ELEMENT_KEYWORDS;
            break;
        case ISI_ENTITY:
        case ISI_NOTATION:
            keywords = NOTATION_KEYWORDS;
            break;
        default:
            keywords = Collections.emptySet();
            break;
    }
    DTDTokenId id = keywords.contains(text) ? DTDTokenId.KEYWORD : DTDTokenId.NAME;
    return tokenFactory.createToken(id);
}
/**
 * Lexes one token inside a processing instruction. Resumes an interrupted string
 * value first, then reports whitespace, handles the characters that end or break
 * the instruction, and finally delegates according to the substate.
 *
 * @return the next token, or {@code null} at the end of input
 */
private Token nextProcessingInstr() {
    switch (substate) {
        case SUB_VALUE_DOUBLE:
            restoreState();
            return stringvalue('"');
        case SUB_VALUE_QUOTE:
            restoreState();
            return stringvalue('\'');
        default:
            break;
    }
    Token wh = skipWhitespace();
    if (wh != null) {
        return wh;
    }
    int ch = input.read();
    if (ch == LexerInput.EOF) {
        return null;
    }
    // process escapes to the upper level
    switch (ch) {
        case '?': // NOI18N
            return endProcessingInstruction();
        case '>': // NOI18N
            setState(ISI_INIT);
            return error();
        case '<': // NOI18N
            return unterminatedFoundOpen();
        default:
            break;
    }
    switch (substate) {
        case SUB_PROCESSING_TARGET:
            input.backup(1);
            return nextProcessingTarget();
        case SUB_PROCESSING_CONTENT:
            input.backup(1);
            return nextProcessingContent();
        case SUB_PROCESSING_XML:
            if (ch == '=') { // NOI18N
                // operator
                return tokenFactory.createToken(DTDTokenId.OPERATOR);
            } else if (ch == '"' || ch == '\'') { // NOI18N
                return stringvalue(ch);
            } else if (Character.isLetter(ch)) {
                // probably a name
                input.backup(1);
                return processName();
            }
            // anything else is an error; the single ERROR token is created below
            break;
        default:
            break;
    }
    return error();
}
/**
 * Lexes (a part of) a quoted string value. The value is split at '&amp;' so that
 * references inside it become separate tokens; the substate remembers the
 * delimiter so that lexing of the value can be resumed afterwards.
 *
 * @param delimiter the quote character that terminates the value
 * @return STRING or reference token; when a resumed value immediately runs into
 * '&lt;' or the end of input, the result of lexing in the initial state
 * ({@code null} at the end of input)
 */
private Token stringvalue(int delimiter) {
    int ch;
    while ((ch = input.read()) != LexerInput.EOF) {
        if (ch == delimiter) {
            restoreState();
            return tokenFactory.createToken(DTDTokenId.STRING);
        }
        // entity reference in attribute !
        if (ch == '&') { // NOI18N
            // remember where to return to before marking the value as interrupted
            intrSubstate = saveState();
            substate = delimiter == '"' ? SUB_VALUE_DOUBLE : SUB_VALUE_QUOTE;
            return valueEntityReference();
        } else if (ch == '<') { // NOI18N
            // ouch ! unterminated value - go to the basic state.
            input.backup(1);
            break;
        }
    }
    if (input.readLength() > 0) {
        return tokenFactory.createToken(DTDTokenId.STRING);
    }
    // Nothing belongs to the value any more. A zero-length ERROR token must not
    // be created, so continue in the initial state: it lexes the pending '<',
    // or returns null at the end of input.
    setState(ISI_INIT);
    return nextTokenInit();
}
/**
 * Called when '&amp;' was read inside a string value. If value text precedes the
 * '&amp;', that text is emitted first and the '&amp;' is left for the next token;
 * otherwise the reference itself is lexed.
 *
 * @return partial STRING token or the token produced for the reference
 */
private Token valueEntityReference() {
    if (input.readLength() <= 1) {
        return processEntityOrCharacterRef();
    }
    // output partial value token
    input.backup(1);
    return tokenFactory.createToken(DTDTokenId.STRING);
}
/**
 * Consumes a run of whitespace characters and produces a WS token for it.
 * Does not change state / substate.
 *
 * @return WS token if at least one whitespace character was read, {@code null} otherwise
 */
private Token skipWhitespace() {
    final int before = input.readLength();
    int c;
    do {
        c = input.read();
    } while (c != LexerInput.EOF && Character.isWhitespace(c));
    if (c != LexerInput.EOF) {
        // the first non-whitespace character belongs to the next token
        input.backup(1);
    }
    int consumed = input.readLength() - before;
    return consumed > 0 ? tokenFactory.createToken(DTDTokenId.WS, consumed) : null;
}
/**
 * Tells whether a character may appear in a name.
 *
 * @param c the character to test
 * @param first {@code true} when the character would start the name; '-', '.'
 * and digits are accepted only at later positions
 * @return {@code true} if the character is acceptable at that position
 */
private boolean isNametokenChar(int c, boolean first) {
    if (c == ':' || c == '_') { // NOI18N
        return true;
    }
    if (first) {
        return Character.isLetter(c);
    }
    return c == '-' || c == '.' || Character.isLetterOrDigit(c); // NOI18N
}
/**
 * Creates an ERROR token for everything read so far. While an interrupted
 * construct is pending (intrSubstate is non-zero) the token is a MIDDLE part.
 *
 * @return the ERROR token
 */
private Token error() {
    if (intrSubstate == 0) {
        return tokenFactory.createToken(DTDTokenId.ERROR);
    }
    return tokenFactory.createToken(DTDTokenId.ERROR, input.readLength(), PartType.MIDDLE);
}
/**
 * Scans a name made of characters accepted by isNametokenChar().
 *
 * @return NAME token when at least one character was consumed; otherwise the
 * result of ending the processing instruction if the terminating character was
 * '?', or an ERROR token
 */
private Token processName() {
    boolean atStart = true;
    int c = input.read();
    while (c != LexerInput.EOF && isNametokenChar(c, atStart)) {
        atStart = false;
        c = input.read();
    }
    if (c != LexerInput.EOF) {
        // the terminating character is not part of the name
        input.backup(1);
    }
    if (input.readLength() > 0) {
        // output a NAME token
        return tokenFactory.createToken(DTDTokenId.NAME);
    }
    if (c == '?') {
        return endProcessingInstruction();
    }
    return error();
}
/**
 * Finishes a processing instruction; the '?' was already consumed. The lexer
 * returns to the initial state regardless of the following character.
 *
 * @return SYMBOL token if '&gt;' follows, ERROR token otherwise
 */
private Token endProcessingInstruction() {
    boolean closed = input.read() == '>';
    setState(ISI_INIT);
    // without the closing '>' the '?' is erroneous
    return closed ? tokenFactory.createToken(DTDTokenId.SYMBOL) : error();
}
/**
 * Scans the target of a processing instruction: a letter followed by letters or
 * digits, terminated by whitespace or '?'. After the target the substate becomes
 * SUB_PROCESSING_CONTENT, or SUB_PROCESSING_XML when the target text equals
 * "xml" (ignoring case).
 *
 * @return TARGET token, or ERROR token for an invalid target
 */
private Token nextProcessingTarget() {
    boolean atStart = true;
    for (int c = input.read(); c != LexerInput.EOF; c = input.read()) {
        boolean question = (c == '?'); // NOI18N
        if (question || Character.isWhitespace(c)) {
            if (atStart) {
                // no target at all
                return question
                        ? nextProcessingContent()
                        : tokenFactory.createToken(DTDTokenId.ERROR);
            }
            substate = SUB_PROCESSING_CONTENT;
            input.backup(1);
            break;
        }
        boolean valid = atStart ? Character.isLetter(c) : Character.isLetterOrDigit(c);
        if (!valid) {
            // apparently an error:
            return tokenFactory.createToken(DTDTokenId.ERROR);
        }
        atStart = false;
    }
    if ("xml".equalsIgnoreCase(input.readText().toString())) { // NOI18N
        substate = SUB_PROCESSING_XML;
    }
    return tokenFactory.createToken(DTDTokenId.TARGET);
}
/**
 * Lexes the content of a processing instruction up to the next '?'.
 * <p>
 * NOTE(review): nextProcessingInstr() handles a leading '?' and leading
 * whitespace itself before delegating here, so the branches that deal with '?'
 * or whitespace as the very first character may only be reachable through
 * nextProcessingTarget() - confirm.
 *
 * @return PI_CONTENT, WS, SYMBOL or ERROR token
 */
private Token nextProcessingContent() {
int ch;
// true when the first character read for this token was whitespace
boolean white = false;
// readLength() right after the first whitespace character, -1 if none was seen
int whiteStart = -1;
while ((ch = input.read()) != LexerInput.EOF) {
if (ch == '?') {
if (whiteStart > 0) {
// whitespace was seen: the token is created with length whiteStart
return tokenFactory.createToken(DTDTokenId.PI_CONTENT, whiteStart);
}
if (input.readLength() > 1) {
// content precedes the '?': emit it and leave the '?' for the next token
input.backup(1);
return tokenFactory.createToken(DTDTokenId.PI_CONTENT);
}
// '?' is the first character of the token
ch = input.read();
if (ch == '>') {
// NOTE(review): the state is not reset to ISI_INIT on this path - confirm.
return tokenFactory.createToken(DTDTokenId.SYMBOL);
}
// exit the processing content
setState(ISI_INIT);
return tokenFactory.createToken(DTDTokenId.ERROR);
}
if (Character.isWhitespace(ch)) {
if (whiteStart == -1) {
whiteStart = input.readLength();
}
if (input.readLength() == 1) {
white = true;
}
} else if (white) {
// report whitespace & continue lexing in the same state:
// NOTE(review): the non-whitespace character just read is not backed up,
// so it is included in the WS token - confirm this is intended.
return tokenFactory.createToken(DTDTokenId.WS);
}
}
// end of input: everything read is reported as content
return tokenFactory.createToken(DTDTokenId.PI_CONTENT);
}
/**
 * Packs the lexer state into a single integer: bits 0-3 hold the state,
 * bits 4-7 the substate and bits 16-23 the interrupted-state marker.
 *
 * @return the packed state as an Integer
 */
@Override
public Object state() {
    int packed = state & 0x0f;
    packed |= (substate & 0x0f) << 4;
    packed |= (intrSubstate & 0xff) << 16;
    return packed;
}
/**
 * Packs the current state (bits 0-3) and substate (bits 4-7) into one value
 * that restoreState() can unpack again.
 *
 * @return the packed state and substate
 */
private int saveState() {
    int lowNibble = state & 0x0f;
    int highNibble = (substate & 0x0f) << 4;
    return lowNibble | highNibble;
}
/**
 * Restores state and substate from the interrupted-state marker and clears the
 * marker. Does nothing when no state was saved (marker is 0).
 */
private void restoreState() {
    final int saved = intrSubstate;
    if (saved != 0) {
        state = saved & 0x0f;
        substate = (saved >> 4) & 0x0f;
        intrSubstate = 0;
    }
}
/**
 * Does nothing.
 */
@Override
public void release() {
}
public DTDLexer(LexerRestartInfo info) {
this.input = info.input();
if (info.state() == null) {
state = ISI_INIT;
substate = 0;
} else {
int s = (Integer)info.state();
this.state = s & 0x0f;
this.substate = (s >> 4) & 0x0f;
this.intrSubstate = (s >> 16) & 0xff;
}
this.tokenFactory = info.tokenFactory();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy