com.itextpdf.styledxmlparser.jsoup.parser.TokeniserState
Styled XML parser is used by iText modules to parse HTML and XML
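For context, a minimal usage sketch of the parser this class belongs to, assuming the standard jsoup-style entry point that the repackaged library exposes (the Jsoup.parse call below mirrors the upstream jsoup API; exact entry-point names in this fork are an assumption):

import com.itextpdf.styledxmlparser.jsoup.Jsoup;
import com.itextpdf.styledxmlparser.jsoup.nodes.Document;

public class ParseSketch {
    public static void main(String[] args) {
        // The tokeniser states defined in the source below drive this parse step.
        Document doc = Jsoup.parse("<p>Hello <b>world</b></p>");
        System.out.println(doc.body().text()); // expected output: Hello world
    }
}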
/*
This file is part of the iText (R) project.
Copyright (c) 1998-2018 iText Group NV
Authors: iText Software.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License version 3
as published by the Free Software Foundation with the addition of the
following permission added to Section 15 as permitted in Section 7(a):
FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
ITEXT GROUP. ITEXT GROUP DISCLAIMS THE WARRANTY OF NON INFRINGEMENT
OF THIRD PARTY RIGHTS
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program; if not, see http://www.gnu.org/licenses or write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
Boston, MA, 02110-1301 USA, or download the license from the following URL:
http://itextpdf.com/terms-of-use/
The interactive user interfaces in modified source and object code versions
of this program must display Appropriate Legal Notices, as required under
Section 5 of the GNU Affero General Public License.
In accordance with Section 7(b) of the GNU Affero General Public License,
a covered work must retain the producer line in every PDF that is created
or manipulated using iText.
You can be released from the requirements of the license by purchasing
a commercial license. Buying such a license is mandatory as soon as you
develop commercial activities involving the iText software without
disclosing the source code of your own applications.
These activities include: offering paid services to customers as an ASP,
serving PDFs on the fly in a web application, shipping iText with a closed
source product.
For more information, please contact iText Software Corp. at this
address: sales@itextpdf.com
*/
package com.itextpdf.styledxmlparser.jsoup.parser;
import java.util.Arrays;
/**
* States and transition activations for the Tokeniser.
*/
abstract class TokeniserState {
static TokeniserState Data = new TokeniserState() {
@Override
String getName() {
return "Data";
}
// in data state, gather characters until a character reference or tag is found
void read(Tokeniser t, CharacterReader r) {
switch (r.current()) {
case '&':
t.advanceTransition(CharacterReferenceInData);
break;
case '<':
t.advanceTransition(TagOpen);
break;
case nullChar:
t.error(this); // NOT replacement character (oddly?)
t.emit(r.consume());
break;
case eof:
t.emit(new Token.EOF());
break;
default:
String data = r.consumeData();
t.emit(data);
break;
}
}
};
static TokeniserState CharacterReferenceInData = new TokeniserState() {
@Override
String getName() {
return "CharacterReferenceInData";
}
// from & in data
void read(Tokeniser t, CharacterReader r) {
readCharRef(t, Data);
}
};
static TokeniserState Rcdata = new TokeniserState() {
@Override
String getName() {
return "Rcdata";
}
/// handles data in title, textarea etc
void read(Tokeniser t, CharacterReader r) {
switch (r.current()) {
case '&':
t.advanceTransition(CharacterReferenceInRcdata);
break;
case '<':
t.advanceTransition(RcdataLessthanSign);
break;
case nullChar:
t.error(this);
r.advance();
t.emit(replacementChar);
break;
case eof:
t.emit(new Token.EOF());
break;
default:
String data = r.consumeToAny('&', '<', nullChar);
t.emit(data);
break;
}
}
};
static TokeniserState CharacterReferenceInRcdata = new TokeniserState() {
@Override
String getName() {
return "CharacterReferenceInRcdata";
}
void read(Tokeniser t, CharacterReader r) {
readCharRef(t, Rcdata);
}
};
static TokeniserState Rawtext = new TokeniserState() {
@Override
String getName() {
return "Rawtext";
}
void read(Tokeniser t, CharacterReader r) {
readData(t, r, this, RawtextLessthanSign);
}
};
static TokeniserState ScriptData = new TokeniserState() {
@Override
String getName() {
return "ScriptData";
}
void read(Tokeniser t, CharacterReader r) {
readData(t, r, this, ScriptDataLessthanSign);
}
};
static TokeniserState PLAINTEXT = new TokeniserState() {
@Override
String getName() {
return "PLAINTEXT";
}
void read(Tokeniser t, CharacterReader r) {
switch (r.current()) {
case nullChar:
t.error(this);
r.advance();
t.emit(replacementChar);
break;
case eof:
t.emit(new Token.EOF());
break;
default:
String data = r.consumeTo(nullChar);
t.emit(data);
break;
}
}
};
static TokeniserState TagOpen = new TokeniserState() {
@Override
String getName() {
return "TagOpen";
}
// from < in data
void read(Tokeniser t, CharacterReader r) {
switch (r.current()) {
case '!':
t.advanceTransition(MarkupDeclarationOpen);
break;
case '/':
t.advanceTransition(EndTagOpen);
break;
case '?':
t.advanceTransition(BogusComment);
break;
default:
if (r.matchesLetter()) {
t.createTagPending(true);
t.transition(TagName);
} else {
t.error(this);
t.emit('<'); // char that got us here
t.transition(Data);
}
break;
}
}
};
static TokeniserState EndTagOpen = new TokeniserState() {
@Override
String getName() {
return "EndTagOpen";
}
void read(Tokeniser t, CharacterReader r) {
if (r.isEmpty()) {
t.eofError(this);
t.emit("");
t.transition(Data);
} else if (r.matchesLetter()) {
t.createTagPending(false);
t.transition(TagName);
} else if (r.matches('>')) {
t.error(this);
t.advanceTransition(Data);
} else {
t.error(this);
t.advanceTransition(BogusComment);
}
}
};
static TokeniserState TagName = new TokeniserState() {
@Override
String getName() {
return "TagName";
}
// from < or </ in data, will have start or end tag pending
void read(Tokeniser t, CharacterReader r) {
// previous TagOpen state did NOT consume, will have a letter char in current
//String tagName = r.consumeToAnySorted(tagCharsSorted).toLowerCase();
String tagName = r.consumeTagName().toLowerCase();
t.tagPending.appendTagName(tagName);
switch (r.consume()) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeAttributeName);
break;
case '/':
t.transition(SelfClosingStartTag);
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
case nullChar: // replacement
t.tagPending.appendTagName(replacementStr);
break;
case eof: // should emit pending tag?
t.eofError(this);
t.transition(Data);
// no default, as covered with above consumeToAny
}
}
};
static TokeniserState RcdataLessthanSign = new TokeniserState() {
@Override
String getName() {
return "RcdataLessthanSign";
}
// from < in rcdata
void read(Tokeniser t, CharacterReader r) {
if (r.matches('/')) {
t.createTempBuffer();
t.advanceTransition(RCDATAEndTagOpen);
} else if (r.matchesLetter() && t.appropriateEndTagName() != null && !r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
// diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than
// consuming to EOF; break out here
t.tagPending = t.createTagPending(false).name(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
} else {
t.emit("<");
t.transition(Rcdata);
}
}
};
static TokeniserState RCDATAEndTagOpen = new TokeniserState() {
@Override
String getName() {
return "RCDATAEndTagOpen";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
t.createTagPending(false);
t.tagPending.appendTagName(Character.toLowerCase(r.current()));
t.dataBuffer.append(Character.toLowerCase(r.current()));
t.advanceTransition(RCDATAEndTagName);
} else {
t.emit("");
t.transition(Rcdata);
}
}
};
static TokeniserState RCDATAEndTagName = new TokeniserState() {
@Override
String getName() {
return "RCDATAEndTagName";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
String name = r.consumeLetterSequence();
t.tagPending.appendTagName(name.toLowerCase());
t.dataBuffer.append(name);
return;
}
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
if (t.isAppropriateEndTagToken())
t.transition(BeforeAttributeName);
else
anythingElse(t, r);
break;
case '/':
if (t.isAppropriateEndTagToken())
t.transition(SelfClosingStartTag);
else
anythingElse(t, r);
break;
case '>':
if (t.isAppropriateEndTagToken()) {
t.emitTagPending();
t.transition(Data);
}
else
anythingElse(t, r);
break;
default:
anythingElse(t, r);
}
}
private void anythingElse(Tokeniser t, CharacterReader r) {
t.emit("" + t.dataBuffer.toString());
r.unconsume();
t.transition(Rcdata);
}
};
static TokeniserState RawtextLessthanSign = new TokeniserState() {
@Override
String getName() {
return "RawtextLessthanSign";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matches('/')) {
t.createTempBuffer();
t.advanceTransition(RawtextEndTagOpen);
} else {
t.emit('<');
t.transition(Rawtext);
}
}
};
static TokeniserState RawtextEndTagOpen = new TokeniserState() {
@Override
String getName() {
return "RawtextEndTagOpen";
}
void read(Tokeniser t, CharacterReader r) {
readEndTag(t, r, RawtextEndTagName, Rawtext);
}
};
static TokeniserState RawtextEndTagName = new TokeniserState() {
@Override
String getName() {
return "RawtextEndTagName";
}
void read(Tokeniser t, CharacterReader r) {
handleDataEndTag(t, r, Rawtext);
}
};
static TokeniserState ScriptDataLessthanSign = new TokeniserState() {
@Override
String getName() {
return "ScriptDataLessthanSign";
}
void read(Tokeniser t, CharacterReader r) {
switch (r.consume()) {
case '/':
t.createTempBuffer();
t.transition(ScriptDataEndTagOpen);
break;
case '!':
t.emit("<!");
t.transition(ScriptDataEscapeStart);
break;
default:
t.emit("<");
r.unconsume();
t.transition(ScriptData);
}
}
};
static TokeniserState ScriptDataEscapedLessthanSign = new TokeniserState() {
@Override
String getName() {
return "ScriptDataEscapedLessthanSign";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
t.createTempBuffer();
t.dataBuffer.append(Character.toLowerCase(r.current()));
t.emit("<" + r.current());
t.advanceTransition(ScriptDataDoubleEscapeStart);
} else if (r.matches('/')) {
t.createTempBuffer();
t.advanceTransition(ScriptDataEscapedEndTagOpen);
} else {
t.emit('<');
t.transition(ScriptDataEscaped);
}
}
};
static TokeniserState ScriptDataEscapedEndTagOpen = new TokeniserState() {
@Override
String getName() {
return "ScriptDataEscapedEndTagOpen";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
t.createTagPending(false);
t.tagPending.appendTagName(Character.toLowerCase(r.current()));
t.dataBuffer.append(r.current());
t.advanceTransition(ScriptDataEscapedEndTagName);
} else {
t.emit("");
t.transition(ScriptDataEscaped);
}
}
};
static TokeniserState ScriptDataEscapedEndTagName = new TokeniserState() {
@Override
String getName() {
return "ScriptDataEscapedEndTagName";
}
void read(Tokeniser t, CharacterReader r) {
handleDataEndTag(t, r, ScriptDataEscaped);
}
};
static TokeniserState ScriptDataDoubleEscapeStart = new TokeniserState() {
@Override
String getName() {
return "ScriptDataDoubleEscapeStart";
}
void read(Tokeniser t, CharacterReader r) {
handleDataDoubleEscapeTag(t, r, ScriptDataDoubleEscaped, ScriptDataEscaped);
}
};
static TokeniserState ScriptDataDoubleEscaped = new TokeniserState() {
@Override
String getName() {
return "ScriptDataDoubleEscaped";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.current();
switch (c) {
case '-':
t.emit(c);
t.advanceTransition(ScriptDataDoubleEscapedDash);
break;
case '<':
t.emit(c);
t.advanceTransition(ScriptDataDoubleEscapedLessthanSign);
break;
case nullChar:
t.error(this);
r.advance();
t.emit(replacementChar);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
default:
String data = r.consumeToAny('-', '<', nullChar);
t.emit(data);
}
}
};
static TokeniserState ScriptDataDoubleEscapedDash = new TokeniserState() {
@Override
String getName() {
return "ScriptDataDoubleEscapedDash";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.emit(c);
t.transition(ScriptDataDoubleEscapedDashDash);
break;
case '<':
t.emit(c);
t.transition(ScriptDataDoubleEscapedLessthanSign);
break;
case nullChar:
t.error(this);
t.emit(replacementChar);
t.transition(ScriptDataDoubleEscaped);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
default:
t.emit(c);
t.transition(ScriptDataDoubleEscaped);
}
}
};
static TokeniserState ScriptDataDoubleEscapedDashDash = new TokeniserState() {
@Override
String getName() {
return "ScriptDataDoubleEscapedDashDash";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.emit(c);
break;
case '<':
t.emit(c);
t.transition(ScriptDataDoubleEscapedLessthanSign);
break;
case '>':
t.emit(c);
t.transition(ScriptData);
break;
case nullChar:
t.error(this);
t.emit(replacementChar);
t.transition(ScriptDataDoubleEscaped);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
default:
t.emit(c);
t.transition(ScriptDataDoubleEscaped);
}
}
};
static TokeniserState ScriptDataDoubleEscapedLessthanSign = new TokeniserState() {
@Override
String getName() {
return "ScriptDataDoubleEscapedLessthanSign";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matches('/')) {
t.emit('/');
t.createTempBuffer();
t.advanceTransition(ScriptDataDoubleEscapeEnd);
} else {
t.transition(ScriptDataDoubleEscaped);
}
}
};
static TokeniserState ScriptDataDoubleEscapeEnd = new TokeniserState() {
@Override
String getName() {
return "ScriptDataDoubleEscapeEnd";
}
void read(Tokeniser t, CharacterReader r) {
handleDataDoubleEscapeTag(t,r, ScriptDataEscaped, ScriptDataDoubleEscaped);
}
};
static TokeniserState BeforeAttributeName = new TokeniserState() {
@Override
String getName() {
return "BeforeAttributeName";
}
// from tagname <xxx
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
break; // ignore whitespace
case '/':
t.transition(SelfClosingStartTag);
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.tagPending.newAttribute();
r.unconsume();
t.transition(AttributeName);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
case '"':
case '\'':
case '<':
case '=':
t.error(this);
t.tagPending.newAttribute();
t.tagPending.appendAttributeName(c);
t.transition(AttributeName);
break;
default: // A-Z, anything else
t.tagPending.newAttribute();
r.unconsume();
t.transition(AttributeName);
}
}
};
static TokeniserState AttributeName = new TokeniserState() {
@Override
String getName() {
return "AttributeName";
}
// from before attribute name
void read(Tokeniser t, CharacterReader r) {
String name = r.consumeToAnySorted(attributeNameCharsSorted);
t.tagPending.appendAttributeName(name.toLowerCase());
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(AfterAttributeName);
break;
case '/':
t.transition(SelfClosingStartTag);
break;
case '=':
t.transition(BeforeAttributeValue);
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeName(replacementChar);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
case '"':
case '\'':
case '<':
t.error(this);
t.tagPending.appendAttributeName(c);
// no default, as covered in consumeToAny
}
}
};
static TokeniserState AfterAttributeName = new TokeniserState() {
@Override
String getName() {
return "AfterAttributeName";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
// ignore
break;
case '/':
t.transition(SelfClosingStartTag);
break;
case '=':
t.transition(BeforeAttributeValue);
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeName(replacementChar);
t.transition(AttributeName);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
case '"':
case '\'':
case '<':
t.error(this);
t.tagPending.newAttribute();
t.tagPending.appendAttributeName(c);
t.transition(AttributeName);
break;
default: // A-Z, anything else
t.tagPending.newAttribute();
r.unconsume();
t.transition(AttributeName);
}
}
};
static TokeniserState BeforeAttributeValue = new TokeniserState() {
@Override
String getName() {
return "BeforeAttributeValue";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
// ignore
break;
case '"':
t.transition(AttributeValue_doubleQuoted);
break;
case '&':
r.unconsume();
t.transition(AttributeValue_unquoted);
break;
case '\'':
t.transition(AttributeValue_singleQuoted);
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeValue(replacementChar);
t.transition(AttributeValue_unquoted);
break;
case eof:
t.eofError(this);
t.emitTagPending();
t.transition(Data);
break;
case '>':
t.error(this);
t.emitTagPending();
t.transition(Data);
break;
case '<':
case '=':
case '`':
t.error(this);
t.tagPending.appendAttributeValue(c);
t.transition(AttributeValue_unquoted);
break;
default:
r.unconsume();
t.transition(AttributeValue_unquoted);
}
}
};
static TokeniserState AttributeValue_doubleQuoted = new TokeniserState() {
@Override
String getName() {
return "AttributeValue_doubleQuoted";
}
void read(Tokeniser t, CharacterReader r) {
String value = r.consumeToAny(attributeDoubleValueCharsSorted);
if (value.length() > 0)
t.tagPending.appendAttributeValue(value);
else
t.tagPending.setEmptyAttributeValue();
char c = r.consume();
switch (c) {
case '"':
t.transition(AfterAttributeValue_quoted);
break;
case '&':
char[] ref = t.consumeCharacterReference('"', true);
if (ref != null)
t.tagPending.appendAttributeValue(ref);
else
t.tagPending.appendAttributeValue('&');
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeValue(replacementChar);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
// no default, handled in consume to any above
}
}
};
static TokeniserState AttributeValue_singleQuoted = new TokeniserState() {
@Override
String getName() {
return "AttributeValue_singleQuoted";
}
void read(Tokeniser t, CharacterReader r) {
String value = r.consumeToAny(attributeSingleValueCharsSorted);
if (value.length() > 0)
t.tagPending.appendAttributeValue(value);
else
t.tagPending.setEmptyAttributeValue();
char c = r.consume();
switch (c) {
case '\'':
t.transition(AfterAttributeValue_quoted);
break;
case '&':
char[] ref = t.consumeCharacterReference('\'', true);
if (ref != null)
t.tagPending.appendAttributeValue(ref);
else
t.tagPending.appendAttributeValue('&');
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeValue(replacementChar);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
// no default, handled in consume to any above
}
}
};
static TokeniserState AttributeValue_unquoted = new TokeniserState() {
@Override
String getName() {
return "AttributeValue_unquoted";
}
void read(Tokeniser t, CharacterReader r) {
String value = r.consumeToAnySorted(attributeValueUnquoted);
if (value.length() > 0)
t.tagPending.appendAttributeValue(value);
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeAttributeName);
break;
case '&':
char[] ref = t.consumeCharacterReference('>', true);
if (ref != null)
t.tagPending.appendAttributeValue(ref);
else
t.tagPending.appendAttributeValue('&');
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeValue(replacementChar);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
case '"':
case '\'':
case '<':
case '=':
case '`':
t.error(this);
t.tagPending.appendAttributeValue(c);
break;
// no default, handled in consume to any above
}
}
};
// CharacterReferenceInAttributeValue state handled inline
static TokeniserState AfterAttributeValue_quoted = new TokeniserState() {
@Override
String getName() {
return "AfterAttributeValue_quoted";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeAttributeName);
break;
case '/':
t.transition(SelfClosingStartTag);
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
default:
t.error(this);
r.unconsume();
t.transition(BeforeAttributeName);
}
}
};
static TokeniserState SelfClosingStartTag = new TokeniserState() {
@Override
String getName() {
return "SelfClosingStartTag";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '>':
t.tagPending.selfClosing = true;
t.emitTagPending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
default:
t.error(this);
t.transition(BeforeAttributeName);
}
}
};
static TokeniserState BogusComment = new TokeniserState() {
@Override
String getName() {
return "BogusComment";
}
void read(Tokeniser t, CharacterReader r) {
// todo: handle bogus comment starting from eof. when does that trigger?
// rewind to capture the character that led us here
r.unconsume();
Token.Comment comment = new Token.Comment();
comment.bogus = true;
comment.data.append(r.consumeTo('>'));
// todo: replace nullChar with replaceChar
t.emit(comment);
t.advanceTransition(Data);
}
};
static TokeniserState MarkupDeclarationOpen = new TokeniserState() {
@Override
String getName() {
return "MarkupDeclarationOpen";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matchConsume("--")) {
t.createCommentPending();
t.transition(CommentStart);
} else if (r.matchConsumeIgnoreCase("DOCTYPE")) {
t.transition(Doctype);
} else if (r.matchConsume("[CDATA[")) {
// todo: should actually check current namespace, and only non-html allows cdata. until namespace
// is implemented properly, keep handling as cdata
//} else if (!t.currentNodeInHtmlNS() && r.matchConsume("[CDATA[")) {
t.transition(CdataSection);
} else {
t.error(this);
t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind
}
}
};
static TokeniserState CommentStart = new TokeniserState() {
@Override
String getName() {
return "CommentStart";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.transition(CommentStartDash);
break;
case nullChar:
t.error(this);
t.commentPending.data.append(replacementChar);
t.transition(Comment);
break;
case '>':
t.error(this);
t.emitCommentPending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.commentPending.data.append(c);
t.transition(Comment);
}
}
};
static TokeniserState CommentStartDash = new TokeniserState() {
@Override
String getName() {
return "CommentStartDash";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.transition(CommentStartDash);
break;
case nullChar:
t.error(this);
t.commentPending.data.append(replacementChar);
t.transition(Comment);
break;
case '>':
t.error(this);
t.emitCommentPending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.commentPending.data.append(c);
t.transition(Comment);
}
}
};
static TokeniserState Comment = new TokeniserState() {
@Override
String getName() {
return "Comment";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.current();
switch (c) {
case '-':
t.advanceTransition(CommentEndDash);
break;
case nullChar:
t.error(this);
r.advance();
t.commentPending.data.append(replacementChar);
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.commentPending.data.append(r.consumeToAny('-', nullChar));
}
}
};
static TokeniserState CommentEndDash = new TokeniserState() {
@Override
String getName() {
return "CommentEndDash";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.transition(CommentEnd);
break;
case nullChar:
t.error(this);
t.commentPending.data.append('-').append(replacementChar);
t.transition(Comment);
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.commentPending.data.append('-').append(c);
t.transition(Comment);
}
}
};
static TokeniserState CommentEnd = new TokeniserState() {
@Override
String getName() {
return "CommentEnd";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '>':
t.emitCommentPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.commentPending.data.append("--").append(replacementChar);
t.transition(Comment);
break;
case '!':
t.error(this);
t.transition(CommentEndBang);
break;
case '-':
t.error(this);
t.commentPending.data.append('-');
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.error(this);
t.commentPending.data.append("--").append(c);
t.transition(Comment);
}
}
};
static TokeniserState CommentEndBang = new TokeniserState() {
@Override
String getName() {
return "CommentEndBang";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.commentPending.data.append("--!");
t.transition(CommentEndDash);
break;
case '>':
t.emitCommentPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.commentPending.data.append("--!").append(replacementChar);
t.transition(Comment);
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.commentPending.data.append("--!").append(c);
t.transition(Comment);
}
}
};
static TokeniserState Doctype = new TokeniserState() {
@Override
String getName() {
return "Doctype";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeDoctypeName);
break;
case eof:
t.eofError(this);
// note: fall through to > case
case '>': // catch invalid
t.error(this);
t.createDoctypePending();
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.transition(BeforeDoctypeName);
}
}
};
static TokeniserState BeforeDoctypeName = new TokeniserState() {
@Override
String getName() {
return "BeforeDoctypeName";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
t.createDoctypePending();
t.transition(DoctypeName);
return;
}
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
break; // ignore whitespace
case nullChar:
t.error(this);
t.createDoctypePending();
t.doctypePending.name.append(replacementChar);
t.transition(DoctypeName);
break;
case eof:
t.eofError(this);
t.createDoctypePending();
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.createDoctypePending();
t.doctypePending.name.append(c);
t.transition(DoctypeName);
}
}
};
static TokeniserState DoctypeName = new TokeniserState() {
@Override
String getName() {
return "DoctypeName";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
String name = r.consumeLetterSequence();
t.doctypePending.name.append(name.toLowerCase());
return;
}
char c = r.consume();
switch (c) {
case '>':
t.emitDoctypePending();
t.transition(Data);
break;
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(AfterDoctypeName);
break;
case nullChar:
t.error(this);
t.doctypePending.name.append(replacementChar);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.doctypePending.name.append(c);
}
}
};
static TokeniserState AfterDoctypeName = new TokeniserState() {
@Override
String getName() {
return "AfterDoctypeName";
}
void read(Tokeniser t, CharacterReader r) {
if (r.isEmpty()) {
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
return;
}
if (r.matchesAny('\t', '\n', '\r', '\f', ' '))
r.advance(); // ignore whitespace
else if (r.matches('>')) {
t.emitDoctypePending();
t.advanceTransition(Data);
} else if (r.matchConsumeIgnoreCase("PUBLIC")) {
t.transition(AfterDoctypePublicKeyword);
} else if (r.matchConsumeIgnoreCase("SYSTEM")) {
t.transition(AfterDoctypeSystemKeyword);
} else {
t.error(this);
t.doctypePending.forceQuirks = true;
t.advanceTransition(BogusDoctype);
}
}
};
static TokeniserState AfterDoctypePublicKeyword = new TokeniserState() {
@Override
String getName() {
return "AfterDoctypePublicKeyword";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeDoctypePublicIdentifier);
break;
case '"':
t.error(this);
// set public id to empty string
t.transition(DoctypePublicIdentifier_doubleQuoted);
break;
case '\'':
t.error(this);
// set public id to empty string
t.transition(DoctypePublicIdentifier_singleQuoted);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.transition(BogusDoctype);
}
}
};
static TokeniserState BeforeDoctypePublicIdentifier = new TokeniserState() {
@Override
String getName() {
return "BeforeDoctypePublicIdentifier";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
break;
case '"':
// set public id to empty string
t.transition(DoctypePublicIdentifier_doubleQuoted);
break;
case '\'':
// set public id to empty string
t.transition(DoctypePublicIdentifier_singleQuoted);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.transition(BogusDoctype);
}
}
};
static TokeniserState DoctypePublicIdentifier_doubleQuoted = new TokeniserState() {
@Override
String getName() {
return "DoctypePublicIdentifier_doubleQuoted";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '"':
t.transition(AfterDoctypePublicIdentifier);
break;
case nullChar:
t.error(this);
t.doctypePending.publicIdentifier.append(replacementChar);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.doctypePending.publicIdentifier.append(c);
}
}
};
static TokeniserState DoctypePublicIdentifier_singleQuoted = new TokeniserState() {
@Override
String getName() {
return "DoctypePublicIdentifier_singleQuoted";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\'':
t.transition(AfterDoctypePublicIdentifier);
break;
case nullChar:
t.error(this);
t.doctypePending.publicIdentifier.append(replacementChar);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.doctypePending.publicIdentifier.append(c);
}
}
};
static TokeniserState AfterDoctypePublicIdentifier = new TokeniserState() {
@Override
String getName() {
return "AfterDoctypePublicIdentifier";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BetweenDoctypePublicAndSystemIdentifiers);
break;
case '>':
t.emitDoctypePending();
t.transition(Data);
break;
case '"':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_doubleQuoted);
break;
case '\'':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_singleQuoted);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.transition(BogusDoctype);
}
}
};
static TokeniserState BetweenDoctypePublicAndSystemIdentifiers = new TokeniserState() {
@Override
String getName() {
return "BetweenDoctypePublicAndSystemIdentifiers";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
break;
case '>':
t.emitDoctypePending();
t.transition(Data);
break;
case '"':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_doubleQuoted);
break;
case '\'':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_singleQuoted);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.transition(BogusDoctype);
}
}
};
static TokeniserState AfterDoctypeSystemKeyword = new TokeniserState() {
@Override
String getName() {
return "AfterDoctypeSystemKeyword";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeDoctypeSystemIdentifier);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case '"':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_doubleQuoted);
break;
case '\'':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_singleQuoted);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
}
}
};
static TokeniserState BeforeDoctypeSystemIdentifier = new TokeniserState() {
@Override
String getName() {
return "BeforeDoctypeSystemIdentifier";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
break;
case '"':
// set system id to empty string
t.transition(DoctypeSystemIdentifier_doubleQuoted);
break;
case '\'':
// set system id to empty string
t.transition(DoctypeSystemIdentifier_singleQuoted);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.transition(BogusDoctype);
}
}
};
static TokeniserState DoctypeSystemIdentifier_doubleQuoted = new TokeniserState() {
@Override
String getName() {
return "DoctypeSystemIdentifier_doubleQuoted";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '"':
t.transition(AfterDoctypeSystemIdentifier);
break;
case nullChar:
t.error(this);
t.doctypePending.systemIdentifier.append(replacementChar);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.doctypePending.systemIdentifier.append(c);
}
}
};
static TokeniserState DoctypeSystemIdentifier_singleQuoted = new TokeniserState() {
@Override
String getName() {
return "DoctypeSystemIdentifier_singleQuoted";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\'':
t.transition(AfterDoctypeSystemIdentifier);
break;
case nullChar:
t.error(this);
t.doctypePending.systemIdentifier.append(replacementChar);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.doctypePending.systemIdentifier.append(c);
}
}
};
static TokeniserState AfterDoctypeSystemIdentifier = new TokeniserState() {
@Override
String getName() {
return "AfterDoctypeSystemIdentifier";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
break;
case '>':
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.transition(BogusDoctype);
// NOT force quirks
}
}
};
static TokeniserState BogusDoctype = new TokeniserState() {
@Override
String getName() {
return "BogusDoctype";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '>':
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.emitDoctypePending();
t.transition(Data);
break;
default:
// ignore char
break;
}
}
};
static TokeniserState CdataSection = new TokeniserState() {
@Override
String getName() {
return "CdataSection";
}
void read(Tokeniser t, CharacterReader r) {
String data = r.consumeTo("]]>");
t.emit(data);
r.matchConsume("]]>");
t.transition(Data);
}
};
@Override
public String toString() {
return getName();
}
abstract String getName();
abstract void read(Tokeniser t, CharacterReader r);
static final char nullChar = '\u0000';
private static final char[] attributeSingleValueCharsSorted = new char[]{'\'', '&', nullChar};
private static final char[] attributeDoubleValueCharsSorted = new char[]{'"', '&', nullChar};
private static final char[] attributeNameCharsSorted = new char[]{'\t', '\n', '\r', '\f', ' ', '/', '=', '>', nullChar, '"', '\'', '<'};
private static final char[] attributeValueUnquoted = new char[]{'\t', '\n', '\r', '\f', ' ', '&', '>', nullChar, '"', '\'', '<', '=', '`'};
private static final char replacementChar = Tokeniser.replacementChar;
private static final String replacementStr = String.valueOf(Tokeniser.replacementChar);
private static final char eof = CharacterReader.EOF;
static {
Arrays.sort(attributeSingleValueCharsSorted);
Arrays.sort(attributeDoubleValueCharsSorted);
Arrays.sort(attributeNameCharsSorted);
Arrays.sort(attributeValueUnquoted);
}
/**
* Handles RawtextEndTagName, ScriptDataEndTagName, and ScriptDataEscapedEndTagName. Same body impl, just
* different else exit transitions.
*/
private static void handleDataEndTag(Tokeniser t, CharacterReader r, TokeniserState elseTransition) {
if (r.matchesLetter()) {
String name = r.consumeLetterSequence();
t.tagPending.appendTagName(name.toLowerCase());
t.dataBuffer.append(name);
return;
}
boolean needsExitTransition = false;
if (t.isAppropriateEndTagToken() && !r.isEmpty()) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeAttributeName);
break;
case '/':
t.transition(SelfClosingStartTag);
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
default:
t.dataBuffer.append(c);
needsExitTransition = true;
}
} else {
needsExitTransition = true;
}
if (needsExitTransition) {
t.emit("" + t.dataBuffer.toString());
t.transition(elseTransition);
}
}
private static void readData(Tokeniser t, CharacterReader r, TokeniserState current, TokeniserState advance) {
switch (r.current()) {
case '<':
t.advanceTransition(advance);
break;
case nullChar:
t.error(current);
r.advance();
t.emit(replacementChar);
break;
case eof:
t.emit(new Token.EOF());
break;
default:
String data = r.consumeToAny('<', nullChar);
t.emit(data);
break;
}
}
private static void readCharRef(Tokeniser t, TokeniserState advance) {
char[] c = t.consumeCharacterReference(null, false);
if (c == null)
t.emit('&');
else
t.emit(c);
t.transition(advance);
}
private static void readEndTag(Tokeniser t, CharacterReader r, TokeniserState a, TokeniserState b) {
if (r.matchesLetter()) {
t.createTagPending(false);
t.transition(a);
} else {
t.emit("");
t.transition(b);
}
}
private static void handleDataDoubleEscapeTag(Tokeniser t, CharacterReader r, TokeniserState primary, TokeniserState fallback) {
if (r.matchesLetter()) {
String name = r.consumeLetterSequence();
t.dataBuffer.append(name.toLowerCase());
t.emit(name);
return;
}
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
case '/':
case '>':
if (t.dataBuffer.toString().equals("script"))
t.transition(primary);
else
t.transition(fallback);
t.emit(c);
break;
default:
r.unconsume();
t.transition(fallback);
}
}
}
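Each state above follows the same pattern: an anonymous TokeniserState subclass whose read() inspects the CharacterReader, emits tokens, and hands control to the next state. A stripped-down sketch of that state-machine pattern, with hypothetical names invented for illustration (not part of this library):

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

abstract class State {
    abstract void read(Machine m, Reader in) throws IOException;

    // Text state: emit characters until '<' or end of input.
    static final State TEXT = new State() {
        void read(Machine m, Reader in) throws IOException {
            int c = in.read();
            if (c == -1) m.done = true;              // EOF ends the run
            else if (c == '<') m.state = State.TAG;  // transition on tag open
            else m.out.append((char) c);             // emit character data
        }
    };

    // Tag state: skip everything up to the closing '>'.
    static final State TAG = new State() {
        void read(Machine m, Reader in) throws IOException {
            int c = in.read();
            if (c == -1) m.done = true;
            else if (c == '>') m.state = State.TEXT; // tag closed, back to text
        }
    };
}

class Machine {
    State state = State.TEXT;
    StringBuilder out = new StringBuilder();
    boolean done = false;

    // Drives the current state until input is exhausted, e.g. run("a<b>c") returns "ac".
    String run(String html) throws IOException {
        Reader in = new StringReader(html);
        while (!done) state.read(this, in);
        return out.toString();
    }
}

The real Tokeniser works the same way, only with a pending-token buffer, error reporting, and the full set of HTML5 tokenizer states shown in the listing above.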