// cn.schoolwow.quickhttp.document.parser.HTMLParser (Maven / Gradle / Ivy)
package cn.schoolwow.quickhttp.document.parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
public class HTMLParser {
/** Class-wide logger; static and final so it is looked up once instead of once per parser instance. */
private static final Logger logger = LoggerFactory.getLogger(HTMLParser.class);
/** Names of tags treated as single nodes (presumably consulted by isSingleNode — confirm against its definition). */
private static final String[] singleNodeList = {"br", "hr", "img", "input", "param", "meta", "link", "!doctype", "?xml", "col"};
/** Input characters, taken from the HTML string handed to the constructor. */
private char[] chars;
/** Current position: index into {@link #chars} of the character being examined. */
private int pos = 0;
/** Start position of the token currently being collected. */
private int sectionStart = 0;
/**
 * Position of the last possible token end. While inside a quoted attribute value this records
 * the first '>' encountered; -1 means no such position has been recorded.
 */
private int lastTokenPos = -1;
/** Name of the most recently parsed tag (the value returned by addToken for a tagName token). */
private String currentTagName;
/** Current lexer state; lexing starts in {@code State.openingTag}. */
private State state = State.openingTag;
/** Tokens collected so far; this is the list returned by {@code parse}. */
private List tokenList = new ArrayList<>();
/**
 * Runs lexical analysis over the given HTML text.
 *
 * @param html the HTML source to analyse; must not be null
 * @return the token list filled while the parser instance was being constructed
 */
public static List parse(String html) {
    // Constructing the parser performs the whole analysis; only its result is kept.
    HTMLParser parser = new HTMLParser(html);
    return parser.tokenList;
}
/**
 * Creates a parser over the given HTML and immediately runs the lexical analysis.
 *
 * @param html the HTML source; its characters are copied into the parser's input buffer
 */
private HTMLParser(String html) {
    this.chars = html.toCharArray();
    this.parseHTML();
}
/**
* 词法分析
*/
private void parseHTML() {
while (pos < chars.length) {
switch (state) {
case openingTag: {
if (isLastMatch("
state = State.inComment;
addToken(HTMLToken.TokenType.openTag);
} else if (isLastMatch("<") && !isNextMatch("!--")) {
// ")) {
//
currentTagName = addToken(HTMLToken.TokenType.tagName);
state = State.closingTag;
} else if (chars[pos] == '>') {
//判断是否单标签属性
currentTagName = addToken(HTMLToken.TokenType.tagName);
if (isSingleNode(currentTagName)) {
//
state = State.closingTag;
} else {
//
state = State.openTagClosing;
}
} else if (!Character.isLetterOrDigit((int) chars[pos])) {
//非英文字符
currentTagName = addToken(HTMLToken.TokenType.tagName);
state = State.inAttribute;
}
}
break;
case inComment: {
if (isNextMatch("-->")) {
//
addToken(HTMLToken.TokenType.commentTag);
state = State.closingTag;
}
}
break;
case inAttribute: {
if (chars[pos] == '\"') {
state = State.inAttributeDoubleQuote;
} else if (chars[pos] == '\'') {
state = State.inAttributeSingleQuote;
} else if (isNextMatch("/>")) {
//
addToken(HTMLToken.TokenType.attribute);
state = State.closingTag;
} else if (isNextMatch(">")) {
//
addToken(HTMLToken.TokenType.attribute);
if (isSingleNode(currentTagName)) {
//
state = State.closingTag;
if (pos == chars.length - 1) {
addToken(HTMLToken.TokenType.closeTag);
}
} else {
//
state = State.openTagClosing;
}
}
}
break;
case inAttributeSingleQuote: {
if (pos == chars.length - 1 && lastTokenPos > 0) {
pos = lastTokenPos;
addToken(HTMLToken.TokenType.attribute);
if (isSingleNode(currentTagName)) {
state = State.closingTag;
} else {
state = State.openTagClosing;
}
lastTokenPos = -1;
} else if (chars[pos] == '\'') {
state = State.inAttribute;
} else if (lastTokenPos == -1 && chars[pos] == '>') {
lastTokenPos = pos;
}
}
break;
case inAttributeDoubleQuote: {
if (pos == chars.length - 1 && lastTokenPos > 0) {
pos = lastTokenPos;
addToken(HTMLToken.TokenType.attribute);
if (isSingleNode(currentTagName)) {
state = State.closingTag;
} else {
state = State.openTagClosing;
}
lastTokenPos = -1;
} else if (chars[pos] == '"') {
state = State.inAttribute;
} else if (lastTokenPos == -1 && chars[pos] == '>') {
lastTokenPos = pos;
}
}
break;
case openTagClosing: {
if (isNextMatch("")) {
state = State.closingTag;
} else if (chars[pos] == '<') {
state = State.openingTag;
} else if ("style".equals(currentTagName) || "script".equals(currentTagName)) {
state = State.inStyleOrScript;
} else {
state = State.inTextContent;
}
addToken(HTMLToken.TokenType.openTagClose);
}
break;
case inStyleOrScript: {
if (isNextMatch("") || isNextMatch("")) {
addToken(HTMLToken.TokenType.textContent);
state = State.closingTag;
}
}
break;
case inTextContent: {
if (isNextMatch("")) {
addToken(HTMLToken.TokenType.textContent);
state = State.closingTag;
} else if (chars[pos] == '<') {
addToken(HTMLToken.TokenType.textContent);
state = State.openingTag;
}
}
break;
case closingTag: {
if (pos == chars.length - 1) {
//$
addToken(HTMLToken.TokenType.closeTag);
break;
} else if (isLastMatch(">") && isNextMatch("")) {
//
addToken(HTMLToken.TokenType.closeTag);
state = State.closingTag;
} else if (isLastMatch(">") && chars[pos] == '<') {
//