org.jsoup.parser.TokeniserState Maven / Gradle / Ivy
package org.jsoup.parser;
import org.jsoup.nodes.DocumentType;
import java.util.Arrays;
/**
* States and transition activations for the Tokeniser.
*/
enum TokeniserState {
Data {
// in data state, gather characters until a character reference or tag is found
void read(Tokeniser t, CharacterReader r) {
switch (r.current()) {
case '&':
t.advanceTransition(CharacterReferenceInData);
break;
case '<':
t.advanceTransition(TagOpen);
break;
case nullChar:
t.error(this); // NOT replacement character (oddly?)
t.emit(r.consume());
break;
case eof:
t.emit(new Token.EOF());
break;
default:
String data = r.consumeData();
t.emit(data);
break;
}
}
},
CharacterReferenceInData {
// from & in data
void read(Tokeniser t, CharacterReader r) {
readCharRef(t, Data);
}
},
Rcdata {
/// handles data in title, textarea etc
void read(Tokeniser t, CharacterReader r) {
switch (r.current()) {
case '&':
t.advanceTransition(CharacterReferenceInRcdata);
break;
case '<':
t.advanceTransition(RcdataLessthanSign);
break;
case nullChar:
t.error(this);
r.advance();
t.emit(replacementChar);
break;
case eof:
t.emit(new Token.EOF());
break;
default:
String data = r.consumeToAny('&', '<', nullChar);
t.emit(data);
break;
}
}
},
CharacterReferenceInRcdata {
void read(Tokeniser t, CharacterReader r) {
readCharRef(t, Rcdata);
}
},
Rawtext {
void read(Tokeniser t, CharacterReader r) {
readData(t, r, this, RawtextLessthanSign);
}
},
ScriptData {
void read(Tokeniser t, CharacterReader r) {
readData(t, r, this, ScriptDataLessthanSign);
}
},
PLAINTEXT {
void read(Tokeniser t, CharacterReader r) {
switch (r.current()) {
case nullChar:
t.error(this);
r.advance();
t.emit(replacementChar);
break;
case eof:
t.emit(new Token.EOF());
break;
default:
String data = r.consumeTo(nullChar);
t.emit(data);
break;
}
}
},
TagOpen {
// from < in data
void read(Tokeniser t, CharacterReader r) {
switch (r.current()) {
case '!':
t.advanceTransition(MarkupDeclarationOpen);
break;
case '/':
t.advanceTransition(EndTagOpen);
break;
case '?':
t.advanceTransition(BogusComment);
break;
default:
if (r.matchesLetter()) {
t.createTagPending(true);
t.transition(TagName);
} else {
t.error(this);
t.emit('<'); // char that got us here
t.transition(Data);
}
break;
}
}
},
EndTagOpen {
void read(Tokeniser t, CharacterReader r) {
if (r.isEmpty()) {
t.eofError(this);
t.emit("");
t.transition(Data);
} else if (r.matchesLetter()) {
t.createTagPending(false);
t.transition(TagName);
} else if (r.matches('>')) {
t.error(this);
t.advanceTransition(Data);
} else {
t.error(this);
t.advanceTransition(BogusComment);
}
}
},
TagName {
// from < or in data, will have start or end tag pending
void read(Tokeniser t, CharacterReader r) {
// previous TagOpen state did NOT consume, will have a letter char in current
//String tagName = r.consumeToAnySorted(tagCharsSorted).toLowerCase();
String tagName = r.consumeTagName();
t.tagPending.appendTagName(tagName);
switch (r.consume()) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeAttributeName);
break;
case '/':
t.transition(SelfClosingStartTag);
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
case nullChar: // replacement
t.tagPending.appendTagName(replacementStr);
break;
case eof: // should emit pending tag?
t.eofError(this);
t.transition(Data);
// no default, as covered with above consumeToAny
}
}
},
RcdataLessthanSign {
// from < in rcdata
void read(Tokeniser t, CharacterReader r) {
if (r.matches('/')) {
t.createTempBuffer();
t.advanceTransition(RCDATAEndTagOpen);
} else if (r.matchesLetter() && t.appropriateEndTagName() != null && !r.containsIgnoreCase("" + t.appropriateEndTagName())) {
// diverge from spec: got a start tag, but there's no appropriate end tag (), so rather than
// consuming to EOF; break out here
t.tagPending = t.createTagPending(false).name(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
} else {
t.emit("<");
t.transition(Rcdata);
}
}
},
RCDATAEndTagOpen {
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
t.createTagPending(false);
t.tagPending.appendTagName(r.current());
t.dataBuffer.append(r.current());
t.advanceTransition(RCDATAEndTagName);
} else {
t.emit("");
t.transition(Rcdata);
}
}
},
RCDATAEndTagName {
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
String name = r.consumeLetterSequence();
t.tagPending.appendTagName(name);
t.dataBuffer.append(name);
return;
}
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
if (t.isAppropriateEndTagToken())
t.transition(BeforeAttributeName);
else
anythingElse(t, r);
break;
case '/':
if (t.isAppropriateEndTagToken())
t.transition(SelfClosingStartTag);
else
anythingElse(t, r);
break;
case '>':
if (t.isAppropriateEndTagToken()) {
t.emitTagPending();
t.transition(Data);
}
else
anythingElse(t, r);
break;
default:
anythingElse(t, r);
}
}
private void anythingElse(Tokeniser t, CharacterReader r) {
t.emit("" + t.dataBuffer.toString());
r.unconsume();
t.transition(Rcdata);
}
},
RawtextLessthanSign {
void read(Tokeniser t, CharacterReader r) {
if (r.matches('/')) {
t.createTempBuffer();
t.advanceTransition(RawtextEndTagOpen);
} else {
t.emit('<');
t.transition(Rawtext);
}
}
},
RawtextEndTagOpen {
void read(Tokeniser t, CharacterReader r) {
readEndTag(t, r, RawtextEndTagName, Rawtext);
}
},
RawtextEndTagName {
void read(Tokeniser t, CharacterReader r) {
handleDataEndTag(t, r, Rawtext);
}
},
ScriptDataLessthanSign {
void read(Tokeniser t, CharacterReader r) {
switch (r.consume()) {
case '/':
t.createTempBuffer();
t.transition(ScriptDataEndTagOpen);
break;
case '!':
t.emit("':
t.emit(c);
t.transition(ScriptData);
break;
case nullChar:
t.error(this);
t.emit(replacementChar);
t.transition(ScriptDataEscaped);
break;
default:
t.emit(c);
t.transition(ScriptDataEscaped);
}
}
},
ScriptDataEscapedLessthanSign {
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
t.createTempBuffer();
t.dataBuffer.append(r.current());
t.emit("<" + r.current());
t.advanceTransition(ScriptDataDoubleEscapeStart);
} else if (r.matches('/')) {
t.createTempBuffer();
t.advanceTransition(ScriptDataEscapedEndTagOpen);
} else {
t.emit('<');
t.transition(ScriptDataEscaped);
}
}
},
ScriptDataEscapedEndTagOpen {
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
t.createTagPending(false);
t.tagPending.appendTagName(r.current());
t.dataBuffer.append(r.current());
t.advanceTransition(ScriptDataEscapedEndTagName);
} else {
t.emit("");
t.transition(ScriptDataEscaped);
}
}
},
ScriptDataEscapedEndTagName {
void read(Tokeniser t, CharacterReader r) {
handleDataEndTag(t, r, ScriptDataEscaped);
}
},
ScriptDataDoubleEscapeStart {
void read(Tokeniser t, CharacterReader r) {
handleDataDoubleEscapeTag(t, r, ScriptDataDoubleEscaped, ScriptDataEscaped);
}
},
ScriptDataDoubleEscaped {
void read(Tokeniser t, CharacterReader r) {
char c = r.current();
switch (c) {
case '-':
t.emit(c);
t.advanceTransition(ScriptDataDoubleEscapedDash);
break;
case '<':
t.emit(c);
t.advanceTransition(ScriptDataDoubleEscapedLessthanSign);
break;
case nullChar:
t.error(this);
r.advance();
t.emit(replacementChar);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
default:
String data = r.consumeToAny('-', '<', nullChar);
t.emit(data);
}
}
},
ScriptDataDoubleEscapedDash {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.emit(c);
t.transition(ScriptDataDoubleEscapedDashDash);
break;
case '<':
t.emit(c);
t.transition(ScriptDataDoubleEscapedLessthanSign);
break;
case nullChar:
t.error(this);
t.emit(replacementChar);
t.transition(ScriptDataDoubleEscaped);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
default:
t.emit(c);
t.transition(ScriptDataDoubleEscaped);
}
}
},
ScriptDataDoubleEscapedDashDash {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.emit(c);
break;
case '<':
t.emit(c);
t.transition(ScriptDataDoubleEscapedLessthanSign);
break;
case '>':
t.emit(c);
t.transition(ScriptData);
break;
case nullChar:
t.error(this);
t.emit(replacementChar);
t.transition(ScriptDataDoubleEscaped);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
default:
t.emit(c);
t.transition(ScriptDataDoubleEscaped);
}
}
},
ScriptDataDoubleEscapedLessthanSign {
void read(Tokeniser t, CharacterReader r) {
if (r.matches('/')) {
t.emit('/');
t.createTempBuffer();
t.advanceTransition(ScriptDataDoubleEscapeEnd);
} else {
t.transition(ScriptDataDoubleEscaped);
}
}
},
ScriptDataDoubleEscapeEnd {
void read(Tokeniser t, CharacterReader r) {
handleDataDoubleEscapeTag(t,r, ScriptDataEscaped, ScriptDataDoubleEscaped);
}
},
BeforeAttributeName {
// from tagname ':
t.emitTagPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.tagPending.newAttribute();
r.unconsume();
t.transition(AttributeName);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
case '"':
case '\'':
case '<':
case '=':
t.error(this);
t.tagPending.newAttribute();
t.tagPending.appendAttributeName(c);
t.transition(AttributeName);
break;
default: // A-Z, anything else
t.tagPending.newAttribute();
r.unconsume();
t.transition(AttributeName);
}
}
},
AttributeName {
// from before attribute name
void read(Tokeniser t, CharacterReader r) {
String name = r.consumeToAnySorted(attributeNameCharsSorted);
t.tagPending.appendAttributeName(name);
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(AfterAttributeName);
break;
case '/':
t.transition(SelfClosingStartTag);
break;
case '=':
t.transition(BeforeAttributeValue);
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeName(replacementChar);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
case '"':
case '\'':
case '<':
t.error(this);
t.tagPending.appendAttributeName(c);
// no default, as covered in consumeToAny
}
}
},
AfterAttributeName {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
// ignore
break;
case '/':
t.transition(SelfClosingStartTag);
break;
case '=':
t.transition(BeforeAttributeValue);
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeName(replacementChar);
t.transition(AttributeName);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
case '"':
case '\'':
case '<':
t.error(this);
t.tagPending.newAttribute();
t.tagPending.appendAttributeName(c);
t.transition(AttributeName);
break;
default: // A-Z, anything else
t.tagPending.newAttribute();
r.unconsume();
t.transition(AttributeName);
}
}
},
BeforeAttributeValue {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
// ignore
break;
case '"':
t.transition(AttributeValue_doubleQuoted);
break;
case '&':
r.unconsume();
t.transition(AttributeValue_unquoted);
break;
case '\'':
t.transition(AttributeValue_singleQuoted);
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeValue(replacementChar);
t.transition(AttributeValue_unquoted);
break;
case eof:
t.eofError(this);
t.emitTagPending();
t.transition(Data);
break;
case '>':
t.error(this);
t.emitTagPending();
t.transition(Data);
break;
case '<':
case '=':
case '`':
t.error(this);
t.tagPending.appendAttributeValue(c);
t.transition(AttributeValue_unquoted);
break;
default:
r.unconsume();
t.transition(AttributeValue_unquoted);
}
}
},
AttributeValue_doubleQuoted {
void read(Tokeniser t, CharacterReader r) {
String value = r.consumeToAny(attributeDoubleValueCharsSorted);
if (value.length() > 0)
t.tagPending.appendAttributeValue(value);
else
t.tagPending.setEmptyAttributeValue();
char c = r.consume();
switch (c) {
case '"':
t.transition(AfterAttributeValue_quoted);
break;
case '&':
int[] ref = t.consumeCharacterReference('"', true);
if (ref != null)
t.tagPending.appendAttributeValue(ref);
else
t.tagPending.appendAttributeValue('&');
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeValue(replacementChar);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
// no default, handled in consume to any above
}
}
},
AttributeValue_singleQuoted {
void read(Tokeniser t, CharacterReader r) {
String value = r.consumeToAny(attributeSingleValueCharsSorted);
if (value.length() > 0)
t.tagPending.appendAttributeValue(value);
else
t.tagPending.setEmptyAttributeValue();
char c = r.consume();
switch (c) {
case '\'':
t.transition(AfterAttributeValue_quoted);
break;
case '&':
int[] ref = t.consumeCharacterReference('\'', true);
if (ref != null)
t.tagPending.appendAttributeValue(ref);
else
t.tagPending.appendAttributeValue('&');
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeValue(replacementChar);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
// no default, handled in consume to any above
}
}
},
AttributeValue_unquoted {
void read(Tokeniser t, CharacterReader r) {
String value = r.consumeToAnySorted(attributeValueUnquoted);
if (value.length() > 0)
t.tagPending.appendAttributeValue(value);
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeAttributeName);
break;
case '&':
int[] ref = t.consumeCharacterReference('>', true);
if (ref != null)
t.tagPending.appendAttributeValue(ref);
else
t.tagPending.appendAttributeValue('&');
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeValue(replacementChar);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
case '"':
case '\'':
case '<':
case '=':
case '`':
t.error(this);
t.tagPending.appendAttributeValue(c);
break;
// no default, handled in consume to any above
}
}
},
// CharacterReferenceInAttributeValue state handled inline
AfterAttributeValue_quoted {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeAttributeName);
break;
case '/':
t.transition(SelfClosingStartTag);
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
default:
t.error(this);
r.unconsume();
t.transition(BeforeAttributeName);
}
}
},
SelfClosingStartTag {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '>':
t.tagPending.selfClosing = true;
t.emitTagPending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
default:
t.error(this);
r.unconsume();
t.transition(BeforeAttributeName);
}
}
},
BogusComment {
void read(Tokeniser t, CharacterReader r) {
// todo: handle bogus comment starting from eof. when does that trigger?
// rewind to capture character that lead us here
r.unconsume();
Token.Comment comment = new Token.Comment();
comment.bogus = true;
comment.data.append(r.consumeTo('>'));
// todo: replace nullChar with replaceChar
t.emit(comment);
t.advanceTransition(Data);
}
},
MarkupDeclarationOpen {
void read(Tokeniser t, CharacterReader r) {
if (r.matchConsume("--")) {
t.createCommentPending();
t.transition(CommentStart);
} else if (r.matchConsumeIgnoreCase("DOCTYPE")) {
t.transition(Doctype);
} else if (r.matchConsume("[CDATA[")) {
// todo: should actually check current namepspace, and only non-html allows cdata. until namespace
// is implemented properly, keep handling as cdata
//} else if (!t.currentNodeInHtmlNS() && r.matchConsume("[CDATA[")) {
t.transition(CdataSection);
} else {
t.error(this);
t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind
}
}
},
CommentStart {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.transition(CommentStartDash);
break;
case nullChar:
t.error(this);
t.commentPending.data.append(replacementChar);
t.transition(Comment);
break;
case '>':
t.error(this);
t.emitCommentPending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.commentPending.data.append(c);
t.transition(Comment);
}
}
},
CommentStartDash {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.transition(CommentStartDash);
break;
case nullChar:
t.error(this);
t.commentPending.data.append(replacementChar);
t.transition(Comment);
break;
case '>':
t.error(this);
t.emitCommentPending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.commentPending.data.append(c);
t.transition(Comment);
}
}
},
Comment {
void read(Tokeniser t, CharacterReader r) {
char c = r.current();
switch (c) {
case '-':
t.advanceTransition(CommentEndDash);
break;
case nullChar:
t.error(this);
r.advance();
t.commentPending.data.append(replacementChar);
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.commentPending.data.append(r.consumeToAny('-', nullChar));
}
}
},
CommentEndDash {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.transition(CommentEnd);
break;
case nullChar:
t.error(this);
t.commentPending.data.append('-').append(replacementChar);
t.transition(Comment);
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.commentPending.data.append('-').append(c);
t.transition(Comment);
}
}
},
CommentEnd {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '>':
t.emitCommentPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.commentPending.data.append("--").append(replacementChar);
t.transition(Comment);
break;
case '!':
t.error(this);
t.transition(CommentEndBang);
break;
case '-':
t.error(this);
t.commentPending.data.append('-');
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.error(this);
t.commentPending.data.append("--").append(c);
t.transition(Comment);
}
}
},
CommentEndBang {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.commentPending.data.append("--!");
t.transition(CommentEndDash);
break;
case '>':
t.emitCommentPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.commentPending.data.append("--!").append(replacementChar);
t.transition(Comment);
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.commentPending.data.append("--!").append(c);
t.transition(Comment);
}
}
},
Doctype {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeDoctypeName);
break;
case eof:
t.eofError(this);
// note: fall through to > case
case '>': // catch invalid
t.error(this);
t.createDoctypePending();
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.transition(BeforeDoctypeName);
}
}
},
BeforeDoctypeName {
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
t.createDoctypePending();
t.transition(DoctypeName);
return;
}
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
break; // ignore whitespace
case nullChar:
t.error(this);
t.createDoctypePending();
t.doctypePending.name.append(replacementChar);
t.transition(DoctypeName);
break;
case eof:
t.eofError(this);
t.createDoctypePending();
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.createDoctypePending();
t.doctypePending.name.append(c);
t.transition(DoctypeName);
}
}
},
DoctypeName {
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
String name = r.consumeLetterSequence();
t.doctypePending.name.append(name);
return;
}
char c = r.consume();
switch (c) {
case '>':
t.emitDoctypePending();
t.transition(Data);
break;
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(AfterDoctypeName);
break;
case nullChar:
t.error(this);
t.doctypePending.name.append(replacementChar);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.doctypePending.name.append(c);
}
}
},
AfterDoctypeName {
void read(Tokeniser t, CharacterReader r) {
if (r.isEmpty()) {
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
return;
}
if (r.matchesAny('\t', '\n', '\r', '\f', ' '))
r.advance(); // ignore whitespace
else if (r.matches('>')) {
t.emitDoctypePending();
t.advanceTransition(Data);
} else if (r.matchConsumeIgnoreCase(DocumentType.PUBLIC_KEY)) {
t.doctypePending.pubSysKey = DocumentType.PUBLIC_KEY;
t.transition(AfterDoctypePublicKeyword);
} else if (r.matchConsumeIgnoreCase(DocumentType.SYSTEM_KEY)) {
t.doctypePending.pubSysKey = DocumentType.SYSTEM_KEY;
t.transition(AfterDoctypeSystemKeyword);
} else {
t.error(this);
t.doctypePending.forceQuirks = true;
t.advanceTransition(BogusDoctype);
}
}
},
AfterDoctypePublicKeyword {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeDoctypePublicIdentifier);
break;
case '"':
t.error(this);
// set public id to empty string
t.transition(DoctypePublicIdentifier_doubleQuoted);
break;
case '\'':
t.error(this);
// set public id to empty string
t.transition(DoctypePublicIdentifier_singleQuoted);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.transition(BogusDoctype);
}
}
},
BeforeDoctypePublicIdentifier {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
break;
case '"':
// set public id to empty string
t.transition(DoctypePublicIdentifier_doubleQuoted);
break;
case '\'':
// set public id to empty string
t.transition(DoctypePublicIdentifier_singleQuoted);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.transition(BogusDoctype);
}
}
},
DoctypePublicIdentifier_doubleQuoted {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '"':
t.transition(AfterDoctypePublicIdentifier);
break;
case nullChar:
t.error(this);
t.doctypePending.publicIdentifier.append(replacementChar);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.doctypePending.publicIdentifier.append(c);
}
}
},
DoctypePublicIdentifier_singleQuoted {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\'':
t.transition(AfterDoctypePublicIdentifier);
break;
case nullChar:
t.error(this);
t.doctypePending.publicIdentifier.append(replacementChar);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.doctypePending.publicIdentifier.append(c);
}
}
},
AfterDoctypePublicIdentifier {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BetweenDoctypePublicAndSystemIdentifiers);
break;
case '>':
t.emitDoctypePending();
t.transition(Data);
break;
case '"':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_doubleQuoted);
break;
case '\'':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_singleQuoted);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.transition(BogusDoctype);
}
}
},
BetweenDoctypePublicAndSystemIdentifiers {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
break;
case '>':
t.emitDoctypePending();
t.transition(Data);
break;
case '"':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_doubleQuoted);
break;
case '\'':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_singleQuoted);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.transition(BogusDoctype);
}
}
},
AfterDoctypeSystemKeyword {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeDoctypeSystemIdentifier);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case '"':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_doubleQuoted);
break;
case '\'':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_singleQuoted);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
}
}
},
BeforeDoctypeSystemIdentifier {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
break;
case '"':
// set system id to empty string
t.transition(DoctypeSystemIdentifier_doubleQuoted);
break;
case '\'':
// set public id to empty string
t.transition(DoctypeSystemIdentifier_singleQuoted);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.transition(BogusDoctype);
}
}
},
DoctypeSystemIdentifier_doubleQuoted {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '"':
t.transition(AfterDoctypeSystemIdentifier);
break;
case nullChar:
t.error(this);
t.doctypePending.systemIdentifier.append(replacementChar);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.doctypePending.systemIdentifier.append(c);
}
}
},
DoctypeSystemIdentifier_singleQuoted {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\'':
t.transition(AfterDoctypeSystemIdentifier);
break;
case nullChar:
t.error(this);
t.doctypePending.systemIdentifier.append(replacementChar);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.doctypePending.systemIdentifier.append(c);
}
}
},
AfterDoctypeSystemIdentifier {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
break;
case '>':
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.transition(BogusDoctype);
// NOT force quirks
}
}
},
BogusDoctype {
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '>':
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.emitDoctypePending();
t.transition(Data);
break;
default:
// ignore char
break;
}
}
},
CdataSection {
void read(Tokeniser t, CharacterReader r) {
String data = r.consumeTo("]]>");
t.emit(data);
r.matchConsume("]]>");
t.transition(Data);
}
};
abstract void read(Tokeniser t, CharacterReader r);
static final char nullChar = '\u0000';
private static final char[] attributeSingleValueCharsSorted = new char[]{'\'', '&', nullChar};
private static final char[] attributeDoubleValueCharsSorted = new char[]{'"', '&', nullChar};
private static final char[] attributeNameCharsSorted = new char[]{'\t', '\n', '\r', '\f', ' ', '/', '=', '>', nullChar, '"', '\'', '<'};
private static final char[] attributeValueUnquoted = new char[]{'\t', '\n', '\r', '\f', ' ', '&', '>', nullChar, '"', '\'', '<', '=', '`'};
private static final char replacementChar = Tokeniser.replacementChar;
private static final String replacementStr = String.valueOf(Tokeniser.replacementChar);
private static final char eof = CharacterReader.EOF;
static {
Arrays.sort(attributeSingleValueCharsSorted);
Arrays.sort(attributeDoubleValueCharsSorted);
Arrays.sort(attributeNameCharsSorted);
Arrays.sort(attributeValueUnquoted);
}
/**
* Handles RawtextEndTagName, ScriptDataEndTagName, and ScriptDataEscapedEndTagName. Same body impl, just
* different else exit transitions.
*/
private static void handleDataEndTag(Tokeniser t, CharacterReader r, TokeniserState elseTransition) {
if (r.matchesLetter()) {
String name = r.consumeLetterSequence();
t.tagPending.appendTagName(name);
t.dataBuffer.append(name);
return;
}
boolean needsExitTransition = false;
if (t.isAppropriateEndTagToken() && !r.isEmpty()) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeAttributeName);
break;
case '/':
t.transition(SelfClosingStartTag);
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
default:
t.dataBuffer.append(c);
needsExitTransition = true;
}
} else {
needsExitTransition = true;
}
if (needsExitTransition) {
t.emit("" + t.dataBuffer.toString());
t.transition(elseTransition);
}
}
private static void readData(Tokeniser t, CharacterReader r, TokeniserState current, TokeniserState advance) {
switch (r.current()) {
case '<':
t.advanceTransition(advance);
break;
case nullChar:
t.error(current);
r.advance();
t.emit(replacementChar);
break;
case eof:
t.emit(new Token.EOF());
break;
default:
String data = r.consumeToAny('<', nullChar);
t.emit(data);
break;
}
}
private static void readCharRef(Tokeniser t, TokeniserState advance) {
int[] c = t.consumeCharacterReference(null, false);
if (c == null)
t.emit('&');
else
t.emit(c);
t.transition(advance);
}
private static void readEndTag(Tokeniser t, CharacterReader r, TokeniserState a, TokeniserState b) {
if (r.matchesLetter()) {
t.createTagPending(false);
t.transition(a);
} else {
t.emit("");
t.transition(b);
}
}
private static void handleDataDoubleEscapeTag(Tokeniser t, CharacterReader r, TokeniserState primary, TokeniserState fallback) {
if (r.matchesLetter()) {
String name = r.consumeLetterSequence();
t.dataBuffer.append(name);
t.emit(name);
return;
}
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
case '/':
case '>':
if (t.dataBuffer.toString().equals("script"))
t.transition(primary);
else
t.transition(fallback);
t.emit(c);
break;
default:
r.unconsume();
t.transition(fallback);
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy