package org.jsoup.parser;
import java.util.Arrays;
/**
* States and transition activations for the Tokeniser.
*/
abstract class TokeniserState {
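// One shared instance per tokeniser state. The Tokeniser holds the current state and calls read(),
// which consumes from the CharacterReader, emits tokens, and selects the next state via transition()
// or advanceTransition().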
static TokeniserState Data = new TokeniserState() {
@Override
String getName() {
return "Data";
}
// in data state, gather characters until a character reference or tag is found
void read(Tokeniser t, CharacterReader r) {
switch (r.current()) {
case '&':
t.advanceTransition(CharacterReferenceInData);
break;
case '<':
t.advanceTransition(TagOpen);
break;
case nullChar:
t.error(this); // NOT replacement character (oddly?)
t.emit(r.consume());
break;
case eof:
t.emit(new Token.EOF());
break;
default:
String data = r.consumeData();
t.emit(data);
break;
}
}
};
static TokeniserState CharacterReferenceInData = new TokeniserState() {
@Override
String getName() {
return "CharacterReferenceInData";
}
// from & in data
void read(Tokeniser t, CharacterReader r) {
readCharRef(t, Data);
}
};
static TokeniserState Rcdata = new TokeniserState() {
@Override
String getName() {
return "Rcdata";
}
// handles data in title, textarea etc
void read(Tokeniser t, CharacterReader r) {
switch (r.current()) {
case '&':
t.advanceTransition(CharacterReferenceInRcdata);
break;
case '<':
t.advanceTransition(RcdataLessthanSign);
break;
case nullChar:
t.error(this);
r.advance();
t.emit(replacementChar);
break;
case eof:
t.emit(new Token.EOF());
break;
default:
String data = r.consumeToAny('&', '<', nullChar);
t.emit(data);
break;
}
}
};
static TokeniserState CharacterReferenceInRcdata = new TokeniserState() {
@Override
String getName() {
return "CharacterReferenceInRcdata";
}
void read(Tokeniser t, CharacterReader r) {
readCharRef(t, Rcdata);
}
};
static TokeniserState Rawtext = new TokeniserState() {
@Override
String getName() {
return "Rawtext";
}
void read(Tokeniser t, CharacterReader r) {
readData(t, r, this, RawtextLessthanSign);
}
};
static TokeniserState ScriptData = new TokeniserState() {
@Override
String getName() {
return "ScriptData";
}
void read(Tokeniser t, CharacterReader r) {
readData(t, r, this, ScriptDataLessthanSign);
}
};
static TokeniserState PLAINTEXT = new TokeniserState() {
@Override
String getName() {
return "PLAINTEXT";
}
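// everything up to EOF is emitted as character data; only U+0000 is replaced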
void read(Tokeniser t, CharacterReader r) {
switch (r.current()) {
case nullChar:
t.error(this);
r.advance();
t.emit(replacementChar);
break;
case eof:
t.emit(new Token.EOF());
break;
default:
String data = r.consumeTo(nullChar);
t.emit(data);
break;
}
}
};
static TokeniserState TagOpen = new TokeniserState() {
@Override
String getName() {
return "TagOpen";
}
// from < in data
void read(Tokeniser t, CharacterReader r) {
switch (r.current()) {
case '!':
t.advanceTransition(MarkupDeclarationOpen);
break;
case '/':
t.advanceTransition(EndTagOpen);
break;
case '?':
t.advanceTransition(BogusComment);
break;
default:
if (r.matchesLetter()) {
t.createTagPending(true);
t.transition(TagName);
} else {
t.error(this);
t.emit('<'); // char that got us here
t.transition(Data);
}
break;
}
}
};
static TokeniserState EndTagOpen = new TokeniserState() {
@Override
String getName() {
return "EndTagOpen";
}
void read(Tokeniser t, CharacterReader r) {
if (r.isEmpty()) {
t.eofError(this);
t.emit("</");
t.transition(Data);
} else if (r.matchesLetter()) {
t.createTagPending(false);
t.transition(TagName);
} else if (r.matches('>')) {
t.error(this);
t.advanceTransition(Data);
} else {
t.error(this);
t.advanceTransition(BogusComment);
}
}
};
static TokeniserState TagName = new TokeniserState() {
@Override
String getName() {
return "TagName";
}
// from < or </ in data, will have start or end tag pending
void read(Tokeniser t, CharacterReader r) {
// previous TagOpen state did NOT consume, will have a letter char in current
//String tagName = r.consumeToAnySorted(tagCharsSorted).toLowerCase();
String tagName = r.consumeTagName().toLowerCase();
t.tagPending.appendTagName(tagName);
switch (r.consume()) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeAttributeName);
break;
case '/':
t.transition(SelfClosingStartTag);
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
case nullChar: // replacement
t.tagPending.appendTagName(replacementStr);
break;
case eof: // should emit pending tag?
t.eofError(this);
t.transition(Data);
// no default, as covered with above consumeToAny
}
}
};
static TokeniserState RcdataLessthanSign = new TokeniserState() {
@Override
String getName() {
return "RcdataLessthanSign";
}
// from < in rcdata
void read(Tokeniser t, CharacterReader r) {
if (r.matches('/')) {
t.createTempBuffer();
t.advanceTransition(RCDATAEndTagOpen);
} else if (r.matchesLetter() && t.appropriateEndTagName() != null && !r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
// diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than
// consuming to EOF; break out here
t.tagPending = t.createTagPending(false).name(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
} else {
t.emit("<");
t.transition(Rcdata);
}
}
};
static TokeniserState RCDATAEndTagOpen = new TokeniserState() {
@Override
String getName() {
return "RCDATAEndTagOpen";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
t.createTagPending(false);
t.tagPending.appendTagName(Character.toLowerCase(r.current()));
t.dataBuffer.append(Character.toLowerCase(r.current()));
t.advanceTransition(RCDATAEndTagName);
} else {
t.emit("</");
t.transition(Rcdata);
}
}
};
static TokeniserState RCDATAEndTagName = new TokeniserState() {
@Override
String getName() {
return "RCDATAEndTagName";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
String name = r.consumeLetterSequence();
t.tagPending.appendTagName(name.toLowerCase());
t.dataBuffer.append(name);
return;
}
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
if (t.isAppropriateEndTagToken())
t.transition(BeforeAttributeName);
else
anythingElse(t, r);
break;
case '/':
if (t.isAppropriateEndTagToken())
t.transition(SelfClosingStartTag);
else
anythingElse(t, r);
break;
case '>':
if (t.isAppropriateEndTagToken()) {
t.emitTagPending();
t.transition(Data);
}
else
anythingElse(t, r);
break;
default:
anythingElse(t, r);
}
}
private void anythingElse(Tokeniser t, CharacterReader r) {
t.emit("</" + t.dataBuffer.toString());
r.unconsume();
t.transition(Rcdata);
}
};
static TokeniserState RawtextLessthanSign = new TokeniserState() {
@Override
String getName() {
return "RawtextLessthanSign";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matches('/')) {
t.createTempBuffer();
t.advanceTransition(RawtextEndTagOpen);
} else {
t.emit('<');
t.transition(Rawtext);
}
}
};
static TokeniserState RawtextEndTagOpen = new TokeniserState() {
@Override
String getName() {
return "RawtextEndTagOpen";
}
void read(Tokeniser t, CharacterReader r) {
readEndTag(t, r, RawtextEndTagName, Rawtext);
}
};
static TokeniserState RawtextEndTagName = new TokeniserState() {
@Override
String getName() {
return "RawtextEndTagName";
}
void read(Tokeniser t, CharacterReader r) {
handleDataEndTag(t, r, Rawtext);
}
};
static TokeniserState ScriptDataLessthanSign = new TokeniserState() {
@Override
String getName() {
return "ScriptDataLessthanSign";
}
void read(Tokeniser t, CharacterReader r) {
switch (r.consume()) {
case '/':
t.createTempBuffer();
t.transition(ScriptDataEndTagOpen);
break;
case '!':
t.emit("<!");
t.transition(ScriptDataEscapeStart);
break;
default:
t.emit("<");
r.unconsume();
t.transition(ScriptData);
}
}
};
static TokeniserState ScriptDataEndTagOpen = new TokeniserState() {
@Override
String getName() {
return "ScriptDataEndTagOpen";
}
void read(Tokeniser t, CharacterReader r) {
readEndTag(t, r, ScriptDataEndTagName, ScriptData);
}
};
static TokeniserState ScriptDataEndTagName = new TokeniserState() {
@Override
String getName() {
return "ScriptDataEndTagName";
}
void read(Tokeniser t, CharacterReader r) {
handleDataEndTag(t, r, ScriptData);
}
};
static TokeniserState ScriptDataEscapeStart = new TokeniserState() {
@Override
String getName() {
return "ScriptDataEscapeStart";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matches('-')) {
t.emit('-');
t.advanceTransition(ScriptDataEscapeStartDash);
} else {
t.transition(ScriptData);
}
}
};
static TokeniserState ScriptDataEscapeStartDash = new TokeniserState() {
@Override
String getName() {
return "ScriptDataEscapeStartDash";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matches('-')) {
t.emit('-');
t.advanceTransition(ScriptDataEscapedDashDash);
} else {
t.transition(ScriptData);
}
}
};
static TokeniserState ScriptDataEscaped = new TokeniserState() {
@Override
String getName() {
return "ScriptDataEscaped";
}
void read(Tokeniser t, CharacterReader r) {
if (r.isEmpty()) {
t.eofError(this);
t.transition(Data);
return;
}
switch (r.current()) {
case '-':
t.emit('-');
t.advanceTransition(ScriptDataEscapedDash);
break;
case '<':
t.advanceTransition(ScriptDataEscapedLessthanSign);
break;
case nullChar:
t.error(this);
r.advance();
t.emit(replacementChar);
break;
default:
String data = r.consumeToAny('-', '<', nullChar);
t.emit(data);
}
}
};
static TokeniserState ScriptDataEscapedDash = new TokeniserState() {
@Override
String getName() {
return "ScriptDataEscapedDash";
}
void read(Tokeniser t, CharacterReader r) {
if (r.isEmpty()) {
t.eofError(this);
t.transition(Data);
return;
}
char c = r.consume();
switch (c) {
case '-':
t.emit(c);
t.transition(ScriptDataEscapedDashDash);
break;
case '<':
t.transition(ScriptDataEscapedLessthanSign);
break;
case nullChar:
t.error(this);
t.emit(replacementChar);
t.transition(ScriptDataEscaped);
break;
default:
t.emit(c);
t.transition(ScriptDataEscaped);
}
}
};
static TokeniserState ScriptDataEscapedDashDash = new TokeniserState() {
@Override
String getName() {
return "ScriptDataEscapedDashDash";
}
void read(Tokeniser t, CharacterReader r) {
if (r.isEmpty()) {
t.eofError(this);
t.transition(Data);
return;
}
char c = r.consume();
switch (c) {
case '-':
t.emit(c);
break;
case '<':
t.transition(ScriptDataEscapedLessthanSign);
break;
case '>':
t.emit(c);
t.transition(ScriptData);
break;
case nullChar:
t.error(this);
t.emit(replacementChar);
t.transition(ScriptDataEscaped);
break;
default:
t.emit(c);
t.transition(ScriptDataEscaped);
}
}
};
static TokeniserState ScriptDataEscapedLessthanSign = new TokeniserState() {
@Override
String getName() {
return "ScriptDataEscapedLessthanSign";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
t.createTempBuffer();
t.dataBuffer.append(Character.toLowerCase(r.current()));
t.emit("<" + r.current());
t.advanceTransition(ScriptDataDoubleEscapeStart);
} else if (r.matches('/')) {
t.createTempBuffer();
t.advanceTransition(ScriptDataEscapedEndTagOpen);
} else {
t.emit('<');
t.transition(ScriptDataEscaped);
}
}
};
static TokeniserState ScriptDataEscapedEndTagOpen = new TokeniserState() {
@Override
String getName() {
return "ScriptDataEscapedEndTagOpen";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
t.createTagPending(false);
t.tagPending.appendTagName(Character.toLowerCase(r.current()));
t.dataBuffer.append(r.current());
t.advanceTransition(ScriptDataEscapedEndTagName);
} else {
t.emit("</");
t.transition(ScriptDataEscaped);
}
}
};
static TokeniserState ScriptDataEscapedEndTagName = new TokeniserState() {
@Override
String getName() {
return "ScriptDataEscapedEndTagName";
}
void read(Tokeniser t, CharacterReader r) {
handleDataEndTag(t, r, ScriptDataEscaped);
}
};
static TokeniserState ScriptDataDoubleEscapeStart = new TokeniserState() {
@Override
String getName() {
return "ScriptDataDoubleEscapeStart";
}
void read(Tokeniser t, CharacterReader r) {
handleDataDoubleEscapeTag(t, r, ScriptDataDoubleEscaped, ScriptDataEscaped);
}
};
static TokeniserState ScriptDataDoubleEscaped = new TokeniserState() {
@Override
String getName() {
return "ScriptDataDoubleEscaped";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.current();
switch (c) {
case '-':
t.emit(c);
t.advanceTransition(ScriptDataDoubleEscapedDash);
break;
case '<':
t.emit(c);
t.advanceTransition(ScriptDataDoubleEscapedLessthanSign);
break;
case nullChar:
t.error(this);
r.advance();
t.emit(replacementChar);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
default:
String data = r.consumeToAny('-', '<', nullChar);
t.emit(data);
}
}
};
static TokeniserState ScriptDataDoubleEscapedDash = new TokeniserState() {
@Override
String getName() {
return "ScriptDataDoubleEscapedDash";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.emit(c);
t.transition(ScriptDataDoubleEscapedDashDash);
break;
case '<':
t.emit(c);
t.transition(ScriptDataDoubleEscapedLessthanSign);
break;
case nullChar:
t.error(this);
t.emit(replacementChar);
t.transition(ScriptDataDoubleEscaped);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
default:
t.emit(c);
t.transition(ScriptDataDoubleEscaped);
}
}
};
static TokeniserState ScriptDataDoubleEscapedDashDash = new TokeniserState() {
@Override
String getName() {
return "ScriptDataDoubleEscapedDashDash";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.emit(c);
break;
case '<':
t.emit(c);
t.transition(ScriptDataDoubleEscapedLessthanSign);
break;
case '>':
t.emit(c);
t.transition(ScriptData);
break;
case nullChar:
t.error(this);
t.emit(replacementChar);
t.transition(ScriptDataDoubleEscaped);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
default:
t.emit(c);
t.transition(ScriptDataDoubleEscaped);
}
}
};
static TokeniserState ScriptDataDoubleEscapedLessthanSign = new TokeniserState() {
@Override
String getName() {
return "ScriptDataDoubleEscapedLessthanSign";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matches('/')) {
t.emit('/');
t.createTempBuffer();
t.advanceTransition(ScriptDataDoubleEscapeEnd);
} else {
t.transition(ScriptDataDoubleEscaped);
}
}
};
static TokeniserState ScriptDataDoubleEscapeEnd = new TokeniserState() {
@Override
String getName() {
return "ScriptDataDoubleEscapeEnd";
}
void read(Tokeniser t, CharacterReader r) {
handleDataDoubleEscapeTag(t,r, ScriptDataEscaped, ScriptDataDoubleEscaped);
}
};
static TokeniserState BeforeAttributeName = new TokeniserState() {
@Override
String getName() {
return "BeforeAttributeName";
}
// from tagname <xxx
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
break; // ignore whitespace
case '/':
t.transition(SelfClosingStartTag);
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.tagPending.newAttribute();
r.unconsume();
t.transition(AttributeName);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
case '"':
case '\'':
case '<':
case '=':
t.error(this);
t.tagPending.newAttribute();
t.tagPending.appendAttributeName(c);
t.transition(AttributeName);
break;
default: // A-Z, anything else
t.tagPending.newAttribute();
r.unconsume();
t.transition(AttributeName);
}
}
};
static TokeniserState AttributeName = new TokeniserState() {
@Override
String getName() {
return "AttributeName";
}
// from before attribute name
void read(Tokeniser t, CharacterReader r) {
String name = r.consumeToAnySorted(attributeNameCharsSorted);
t.tagPending.appendAttributeName(name.toLowerCase());
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(AfterAttributeName);
break;
case '/':
t.transition(SelfClosingStartTag);
break;
case '=':
t.transition(BeforeAttributeValue);
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeName(replacementChar);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
case '"':
case '\'':
case '<':
t.error(this);
t.tagPending.appendAttributeName(c);
// no default, as covered in consumeToAny
}
}
};
static TokeniserState AfterAttributeName = new TokeniserState() {
@Override
String getName() {
return "AfterAttributeName";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
// ignore
break;
case '/':
t.transition(SelfClosingStartTag);
break;
case '=':
t.transition(BeforeAttributeValue);
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeName(replacementChar);
t.transition(AttributeName);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
case '"':
case '\'':
case '<':
t.error(this);
t.tagPending.newAttribute();
t.tagPending.appendAttributeName(c);
t.transition(AttributeName);
break;
default: // A-Z, anything else
t.tagPending.newAttribute();
r.unconsume();
t.transition(AttributeName);
}
}
};
static TokeniserState BeforeAttributeValue = new TokeniserState() {
@Override
String getName() {
return "BeforeAttributeValue";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
// ignore
break;
case '"':
t.transition(AttributeValue_doubleQuoted);
break;
case '&':
r.unconsume();
t.transition(AttributeValue_unquoted);
break;
case '\'':
t.transition(AttributeValue_singleQuoted);
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeValue(replacementChar);
t.transition(AttributeValue_unquoted);
break;
case eof:
t.eofError(this);
t.emitTagPending();
t.transition(Data);
break;
case '>':
t.error(this);
t.emitTagPending();
t.transition(Data);
break;
case '<':
case '=':
case '`':
t.error(this);
t.tagPending.appendAttributeValue(c);
t.transition(AttributeValue_unquoted);
break;
default:
r.unconsume();
t.transition(AttributeValue_unquoted);
}
}
};
static TokeniserState AttributeValue_doubleQuoted = new TokeniserState() {
@Override
String getName() {
return "AttributeValue_doubleQuoted";
}
void read(Tokeniser t, CharacterReader r) {
String value = r.consumeToAny(attributeDoubleValueCharsSorted);
if (value.length() > 0)
t.tagPending.appendAttributeValue(value);
else
t.tagPending.setEmptyAttributeValue();
char c = r.consume();
switch (c) {
case '"':
t.transition(AfterAttributeValue_quoted);
break;
case '&':
char[] ref = t.consumeCharacterReference('"', true);
if (ref != null)
t.tagPending.appendAttributeValue(ref);
else
t.tagPending.appendAttributeValue('&');
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeValue(replacementChar);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
// no default, handled in consume to any above
}
}
};
static TokeniserState AttributeValue_singleQuoted = new TokeniserState() {
@Override
String getName() {
return "AttributeValue_singleQuoted";
}
void read(Tokeniser t, CharacterReader r) {
String value = r.consumeToAny(attributeSingleValueCharsSorted);
if (value.length() > 0)
t.tagPending.appendAttributeValue(value);
else
t.tagPending.setEmptyAttributeValue();
char c = r.consume();
switch (c) {
case '\'':
t.transition(AfterAttributeValue_quoted);
break;
case '&':
char[] ref = t.consumeCharacterReference('\'', true);
if (ref != null)
t.tagPending.appendAttributeValue(ref);
else
t.tagPending.appendAttributeValue('&');
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeValue(replacementChar);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
// no default, handled in consume to any above
}
}
};
static TokeniserState AttributeValue_unquoted = new TokeniserState() {
@Override
String getName() {
return "AttributeValue_unquoted";
}
void read(Tokeniser t, CharacterReader r) {
String value = r.consumeToAnySorted(attributeValueUnquoted);
if (value.length() > 0)
t.tagPending.appendAttributeValue(value);
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeAttributeName);
break;
case '&':
char[] ref = t.consumeCharacterReference('>', true);
if (ref != null)
t.tagPending.appendAttributeValue(ref);
else
t.tagPending.appendAttributeValue('&');
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeValue(replacementChar);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
case '"':
case '\'':
case '<':
case '=':
case '`':
t.error(this);
t.tagPending.appendAttributeValue(c);
break;
// no default, handled in consume to any above
}
}
};
// CharacterReferenceInAttributeValue state handled inline
static TokeniserState AfterAttributeValue_quoted = new TokeniserState() {
@Override
String getName() {
return "AfterAttributeValue_quoted";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeAttributeName);
break;
case '/':
t.transition(SelfClosingStartTag);
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
default:
t.error(this);
r.unconsume();
t.transition(BeforeAttributeName);
}
}
};
static TokeniserState SelfClosingStartTag = new TokeniserState() {
@Override
String getName() {
return "SelfClosingStartTag";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '>':
t.tagPending.selfClosing = true;
t.emitTagPending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.transition(Data);
break;
default:
t.error(this);
t.transition(BeforeAttributeName);
}
}
};
static TokeniserState BogusComment = new TokeniserState() {
@Override
String getName() {
return "BogusComment";
}
void read(Tokeniser t, CharacterReader r) {
// todo: handle bogus comment starting from eof. when does that trigger?
// rewind to capture the character that led us here
r.unconsume();
Token.Comment comment = new Token.Comment();
comment.bogus = true;
comment.data.append(r.consumeTo('>'));
// todo: replace nullChar with replaceChar
t.emit(comment);
t.advanceTransition(Data);
}
};
static TokeniserState MarkupDeclarationOpen = new TokeniserState() {
@Override
String getName() {
return "MarkupDeclarationOpen";
}
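// from <! in data: a comment (--), a DOCTYPE, or a CDATA section; anything else becomes a bogus comment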
void read(Tokeniser t, CharacterReader r) {
if (r.matchConsume("--")) {
t.createCommentPending();
t.transition(CommentStart);
} else if (r.matchConsumeIgnoreCase("DOCTYPE")) {
t.transition(Doctype);
} else if (r.matchConsume("[CDATA[")) {
// todo: should actually check current namespace, and only non-html allows cdata. until namespace
// is implemented properly, keep handling as cdata
//} else if (!t.currentNodeInHtmlNS() && r.matchConsume("[CDATA[")) {
t.transition(CdataSection);
} else {
t.error(this);
t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind
}
}
};
static TokeniserState CommentStart = new TokeniserState() {
@Override
String getName() {
return "CommentStart";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.transition(CommentStartDash);
break;
case nullChar:
t.error(this);
t.commentPending.data.append(replacementChar);
t.transition(Comment);
break;
case '>':
t.error(this);
t.emitCommentPending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.commentPending.data.append(c);
t.transition(Comment);
}
}
};
static TokeniserState CommentStartDash = new TokeniserState() {
@Override
String getName() {
return "CommentStartDash";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.transition(CommentStartDash);
break;
case nullChar:
t.error(this);
t.commentPending.data.append(replacementChar);
t.transition(Comment);
break;
case '>':
t.error(this);
t.emitCommentPending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.commentPending.data.append(c);
t.transition(Comment);
}
}
};
static TokeniserState Comment = new TokeniserState() {
@Override
String getName() {
return "Comment";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.current();
switch (c) {
case '-':
t.advanceTransition(CommentEndDash);
break;
case nullChar:
t.error(this);
r.advance();
t.commentPending.data.append(replacementChar);
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.commentPending.data.append(r.consumeToAny('-', nullChar));
}
}
};
static TokeniserState CommentEndDash = new TokeniserState() {
@Override
String getName() {
return "CommentEndDash";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.transition(CommentEnd);
break;
case nullChar:
t.error(this);
t.commentPending.data.append('-').append(replacementChar);
t.transition(Comment);
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.commentPending.data.append('-').append(c);
t.transition(Comment);
}
}
};
static TokeniserState CommentEnd = new TokeniserState() {
@Override
String getName() {
return "CommentEnd";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '>':
t.emitCommentPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.commentPending.data.append("--").append(replacementChar);
t.transition(Comment);
break;
case '!':
t.error(this);
t.transition(CommentEndBang);
break;
case '-':
t.error(this);
t.commentPending.data.append('-');
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.error(this);
t.commentPending.data.append("--").append(c);
t.transition(Comment);
}
}
};
static TokeniserState CommentEndBang = new TokeniserState() {
@Override
String getName() {
return "CommentEndBang";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '-':
t.commentPending.data.append("--!");
t.transition(CommentEndDash);
break;
case '>':
t.emitCommentPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.commentPending.data.append("--!").append(replacementChar);
t.transition(Comment);
break;
case eof:
t.eofError(this);
t.emitCommentPending();
t.transition(Data);
break;
default:
t.commentPending.data.append("--!").append(c);
t.transition(Comment);
}
}
};
static TokeniserState Doctype = new TokeniserState() {
@Override
String getName() {
return "Doctype";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeDoctypeName);
break;
case eof:
t.eofError(this);
// note: fall through to > case
case '>': // catch invalid
t.error(this);
t.createDoctypePending();
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.transition(BeforeDoctypeName);
}
}
};
static TokeniserState BeforeDoctypeName = new TokeniserState() {
@Override
String getName() {
return "BeforeDoctypeName";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
t.createDoctypePending();
t.transition(DoctypeName);
return;
}
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
break; // ignore whitespace
case nullChar:
t.error(this);
t.createDoctypePending();
t.doctypePending.name.append(replacementChar);
t.transition(DoctypeName);
break;
case eof:
t.eofError(this);
t.createDoctypePending();
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.createDoctypePending();
t.doctypePending.name.append(c);
t.transition(DoctypeName);
}
}
};
static TokeniserState DoctypeName = new TokeniserState() {
@Override
String getName() {
return "DoctypeName";
}
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
String name = r.consumeLetterSequence();
t.doctypePending.name.append(name.toLowerCase());
return;
}
char c = r.consume();
switch (c) {
case '>':
t.emitDoctypePending();
t.transition(Data);
break;
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(AfterDoctypeName);
break;
case nullChar:
t.error(this);
t.doctypePending.name.append(replacementChar);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.doctypePending.name.append(c);
}
}
};
static TokeniserState AfterDoctypeName = new TokeniserState() {
@Override
String getName() {
return "AfterDoctypeName";
}
void read(Tokeniser t, CharacterReader r) {
if (r.isEmpty()) {
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
return;
}
if (r.matchesAny('\t', '\n', '\r', '\f', ' '))
r.advance(); // ignore whitespace
else if (r.matches('>')) {
t.emitDoctypePending();
t.advanceTransition(Data);
} else if (r.matchConsumeIgnoreCase("PUBLIC")) {
t.transition(AfterDoctypePublicKeyword);
} else if (r.matchConsumeIgnoreCase("SYSTEM")) {
t.transition(AfterDoctypeSystemKeyword);
} else {
t.error(this);
t.doctypePending.forceQuirks = true;
t.advanceTransition(BogusDoctype);
}
}
};
static TokeniserState AfterDoctypePublicKeyword = new TokeniserState() {
@Override
String getName() {
return "AfterDoctypePublicKeyword";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeDoctypePublicIdentifier);
break;
case '"':
t.error(this);
// set public id to empty string
t.transition(DoctypePublicIdentifier_doubleQuoted);
break;
case '\'':
t.error(this);
// set public id to empty string
t.transition(DoctypePublicIdentifier_singleQuoted);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.transition(BogusDoctype);
}
}
};
static TokeniserState BeforeDoctypePublicIdentifier = new TokeniserState() {
@Override
String getName() {
return "BeforeDoctypePublicIdentifier";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
break;
case '"':
// set public id to empty string
t.transition(DoctypePublicIdentifier_doubleQuoted);
break;
case '\'':
// set public id to empty string
t.transition(DoctypePublicIdentifier_singleQuoted);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.transition(BogusDoctype);
}
}
};
static TokeniserState DoctypePublicIdentifier_doubleQuoted = new TokeniserState() {
@Override
String getName() {
return "DoctypePublicIdentifier_doubleQuoted";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '"':
t.transition(AfterDoctypePublicIdentifier);
break;
case nullChar:
t.error(this);
t.doctypePending.publicIdentifier.append(replacementChar);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.doctypePending.publicIdentifier.append(c);
}
}
};
static TokeniserState DoctypePublicIdentifier_singleQuoted = new TokeniserState() {
@Override
String getName() {
return "DoctypePublicIdentifier_singleQuoted";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\'':
t.transition(AfterDoctypePublicIdentifier);
break;
case nullChar:
t.error(this);
t.doctypePending.publicIdentifier.append(replacementChar);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.doctypePending.publicIdentifier.append(c);
}
}
};
static TokeniserState AfterDoctypePublicIdentifier = new TokeniserState() {
@Override
String getName() {
return "AfterDoctypePublicIdentifier";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BetweenDoctypePublicAndSystemIdentifiers);
break;
case '>':
t.emitDoctypePending();
t.transition(Data);
break;
case '"':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_doubleQuoted);
break;
case '\'':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_singleQuoted);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.transition(BogusDoctype);
}
}
};
static TokeniserState BetweenDoctypePublicAndSystemIdentifiers = new TokeniserState() {
@Override
String getName() {
return "BetweenDoctypePublicAndSystemIdentifiers";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
break;
case '>':
t.emitDoctypePending();
t.transition(Data);
break;
case '"':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_doubleQuoted);
break;
case '\'':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_singleQuoted);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.transition(BogusDoctype);
}
}
};
static TokeniserState AfterDoctypeSystemKeyword = new TokeniserState() {
@Override
String getName() {
return "AfterDoctypeSystemKeyword";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeDoctypeSystemIdentifier);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case '"':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_doubleQuoted);
break;
case '\'':
t.error(this);
// system id empty
t.transition(DoctypeSystemIdentifier_singleQuoted);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
}
}
};
static TokeniserState BeforeDoctypeSystemIdentifier = new TokeniserState() {
@Override
String getName() {
return "BeforeDoctypeSystemIdentifier";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
break;
case '"':
// set system id to empty string
t.transition(DoctypeSystemIdentifier_doubleQuoted);
break;
case '\'':
// set system id to empty string
t.transition(DoctypeSystemIdentifier_singleQuoted);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.doctypePending.forceQuirks = true;
t.transition(BogusDoctype);
}
}
};
static TokeniserState DoctypeSystemIdentifier_doubleQuoted = new TokeniserState() {
@Override
String getName() {
return "DoctypeSystemIdentifier_doubleQuoted";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '"':
t.transition(AfterDoctypeSystemIdentifier);
break;
case nullChar:
t.error(this);
t.doctypePending.systemIdentifier.append(replacementChar);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.doctypePending.systemIdentifier.append(c);
}
}
};
static TokeniserState DoctypeSystemIdentifier_singleQuoted = new TokeniserState() {
@Override
String getName() {
return "DoctypeSystemIdentifier_singleQuoted";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\'':
t.transition(AfterDoctypeSystemIdentifier);
break;
case nullChar:
t.error(this);
t.doctypePending.systemIdentifier.append(replacementChar);
break;
case '>':
t.error(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.doctypePending.systemIdentifier.append(c);
}
}
};
static TokeniserState AfterDoctypeSystemIdentifier = new TokeniserState() {
@Override
String getName() {
return "AfterDoctypeSystemIdentifier";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
break;
case '>':
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.eofError(this);
t.doctypePending.forceQuirks = true;
t.emitDoctypePending();
t.transition(Data);
break;
default:
t.error(this);
t.transition(BogusDoctype);
// NOT force quirks
}
}
};
static TokeniserState BogusDoctype = new TokeniserState() {
@Override
String getName() {
return "BogusDoctype";
}
void read(Tokeniser t, CharacterReader r) {
char c = r.consume();
switch (c) {
case '>':
t.emitDoctypePending();
t.transition(Data);
break;
case eof:
t.emitDoctypePending();
t.transition(Data);
break;
default:
// ignore char
break;
}
}
};
static TokeniserState CdataSection = new TokeniserState() {
@Override
String getName() {
return "CdataSection";
}
void read(Tokeniser t, CharacterReader r) {
String data = r.consumeTo("]]>");
t.emit(data);
r.matchConsume("]]>");
t.transition(Data);
}
};
@Override
public String toString() {
return getName();
}
abstract String getName();
abstract void read(Tokeniser t, CharacterReader r);
static final char nullChar = '\u0000';
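// terminator character sets used by the attribute states; the static block below sorts them,
// as consumeToAnySorted expects sorted arrays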
private static final char[] attributeSingleValueCharsSorted = new char[]{'\'', '&', nullChar};
private static final char[] attributeDoubleValueCharsSorted = new char[]{'"', '&', nullChar};
private static final char[] attributeNameCharsSorted = new char[]{'\t', '\n', '\r', '\f', ' ', '/', '=', '>', nullChar, '"', '\'', '<'};
private static final char[] attributeValueUnquoted = new char[]{'\t', '\n', '\r', '\f', ' ', '&', '>', nullChar, '"', '\'', '<', '=', '`'};
private static final char replacementChar = Tokeniser.replacementChar;
private static final String replacementStr = String.valueOf(Tokeniser.replacementChar);
private static final char eof = CharacterReader.EOF;
static {
Arrays.sort(attributeSingleValueCharsSorted);
Arrays.sort(attributeDoubleValueCharsSorted);
Arrays.sort(attributeNameCharsSorted);
Arrays.sort(attributeValueUnquoted);
}
/**
* Handles RawtextEndTagName, ScriptDataEndTagName, and ScriptDataEscapedEndTagName. Same body impl, just
* different else exit transitions.
*/
private static void handleDataEndTag(Tokeniser t, CharacterReader r, TokeniserState elseTransition) {
if (r.matchesLetter()) {
String name = r.consumeLetterSequence();
t.tagPending.appendTagName(name.toLowerCase());
t.dataBuffer.append(name);
return;
}
boolean needsExitTransition = false;
if (t.isAppropriateEndTagToken() && !r.isEmpty()) {
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
t.transition(BeforeAttributeName);
break;
case '/':
t.transition(SelfClosingStartTag);
break;
case '>':
t.emitTagPending();
t.transition(Data);
break;
default:
t.dataBuffer.append(c);
needsExitTransition = true;
}
} else {
needsExitTransition = true;
}
if (needsExitTransition) {
t.emit("</" + t.dataBuffer.toString());
t.transition(elseTransition);
}
}
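// shared body for the Rawtext and ScriptData states: emit text up to '<' or a null char,
// advancing into the supplied less-than-sign state on '<'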
private static void readData(Tokeniser t, CharacterReader r, TokeniserState current, TokeniserState advance) {
switch (r.current()) {
case '<':
t.advanceTransition(advance);
break;
case nullChar:
t.error(current);
r.advance();
t.emit(replacementChar);
break;
case eof:
t.emit(new Token.EOF());
break;
default:
String data = r.consumeToAny('<', nullChar);
t.emit(data);
break;
}
}
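// shared body for the character-reference-in-data/rcdata states: consume a reference,
// or emit a literal '&' if none is present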
private static void readCharRef(Tokeniser t, TokeniserState advance) {
char[] c = t.consumeCharacterReference(null, false);
if (c == null)
t.emit('&');
else
t.emit(c);
t.transition(advance);
}
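// shared "</" handling for Rawtext and ScriptData: open an end tag if a letter follows,
// otherwise emit the "</" literally and fall back to state b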
private static void readEndTag(Tokeniser t, CharacterReader r, TokeniserState a, TokeniserState b) {
if (r.matchesLetter()) {
t.createTagPending(false);
t.transition(a);
} else {
t.emit("</");
t.transition(b);
}
}
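// shared body for ScriptDataDoubleEscapeStart/End: buffers the tag name and switches to the
// primary state when the buffer spells "script", else to the fallback state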
private static void handleDataDoubleEscapeTag(Tokeniser t, CharacterReader r, TokeniserState primary, TokeniserState fallback) {
if (r.matchesLetter()) {
String name = r.consumeLetterSequence();
t.dataBuffer.append(name.toLowerCase());
t.emit(name);
return;
}
char c = r.consume();
switch (c) {
case '\t':
case '\n':
case '\r':
case '\f':
case ' ':
case '/':
case '>':
if (t.dataBuffer.toString().equals("script"))
t.transition(primary);
else
t.transition(fallback);
t.emit(c);
break;
default:
r.unconsume();
t.transition(fallback);
}
}
}