All downloads are FREE. The search and download functionality uses the official Maven repository.

cn.schoolwow.quickhttp.document.parser.HTMLParser Maven / Gradle / Ivy

There is a newer version: 3.0.1
Show newest version
package cn.schoolwow.quickhttp.document.parser;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;

public class HTMLParser {
    // One logger per class is enough; static final avoids a lookup per parser instance.
    private static final Logger logger = LoggerFactory.getLogger(HTMLParser.class);
    // Tag names treated as single nodes (no separate closing tag).
    // NOTE(review): presumably consulted by isSingleNode() — confirm against its definition.
    private static final String[] singleNodeList = {"br", "hr", "img", "input", "param", "meta", "link", "!doctype", "?xml", "col"};
    private char[] chars; // input characters being parsed
    private int pos = 0; // current position in chars
    private int sectionStart = 0; // start position of the current token
    // Position of the first '>' seen inside a quoted attribute value, or -1 if none;
    // the quote states rewind to it when the input ends before the quote is closed.
    private int lastTokenPos = -1;
    private String currentTagName; // name of the most recently parsed tag
    private State state = State.openingTag; // initial lexer state
    private List<HTMLToken> tokenList = new ArrayList<>(); // tokens produced so far

    /**
     * Parses the given HTML text and returns the tokens collected by the parser.
     *
     * @param html the HTML source text; must not be {@code null}
     * @return the token list built while constructing the parser
     * @throws NullPointerException if {@code html} is {@code null}
     */
    public static List<HTMLToken> parse(String html) {
        return new HTMLParser(html).tokenList;
    }

    /**
     * Creates a parser over the given text and runs the lexical analysis immediately.
     * Private: callers go through {@link #parse(String)}.
     *
     * @param html the HTML source text; must not be {@code null}
     */
    private HTMLParser(String html) {
        this.chars = html.toCharArray();
        this.parseHTML();
    }

    /**
     * 词法分析
     */
    private void parseHTML() {
        while (pos < chars.length) {
            switch (state) {
                case openingTag: {
                    if (isLastMatch("
                        state = State.inComment;
                        addToken(HTMLToken.TokenType.openTag);
                    } else if (isLastMatch("<") && !isNextMatch("!--")) {
                        //")) {
                        //
currentTagName = addToken(HTMLToken.TokenType.tagName); state = State.closingTag; } else if (chars[pos] == '>') { //判断是否单标签属性 currentTagName = addToken(HTMLToken.TokenType.tagName); if (isSingleNode(currentTagName)) { // state = State.closingTag; } else { // state = State.openTagClosing; } } else if (!Character.isLetterOrDigit((int) chars[pos])) { //非英文字符 currentTagName = addToken(HTMLToken.TokenType.tagName); state = State.inAttribute; } } break; case inComment: { if (isNextMatch("-->")) { // addToken(HTMLToken.TokenType.commentTag); state = State.closingTag; } } break; case inAttribute: { if (chars[pos] == '\"') { state = State.inAttributeDoubleQuote; } else if (chars[pos] == '\'') { state = State.inAttributeSingleQuote; } else if (isNextMatch("/>")) { // addToken(HTMLToken.TokenType.attribute); state = State.closingTag; } else if (isNextMatch(">")) { // addToken(HTMLToken.TokenType.attribute); if (isSingleNode(currentTagName)) { // state = State.closingTag; if (pos == chars.length - 1) { addToken(HTMLToken.TokenType.closeTag); } } else { // state = State.openTagClosing; } } } break; case inAttributeSingleQuote: { if (pos == chars.length - 1 && lastTokenPos > 0) { pos = lastTokenPos; addToken(HTMLToken.TokenType.attribute); if (isSingleNode(currentTagName)) { state = State.closingTag; } else { state = State.openTagClosing; } lastTokenPos = -1; } else if (chars[pos] == '\'') { state = State.inAttribute; } else if (lastTokenPos == -1 && chars[pos] == '>') { lastTokenPos = pos; } } break; case inAttributeDoubleQuote: { if (pos == chars.length - 1 && lastTokenPos > 0) { pos = lastTokenPos; addToken(HTMLToken.TokenType.attribute); if (isSingleNode(currentTagName)) { state = State.closingTag; } else { state = State.openTagClosing; } lastTokenPos = -1; } else if (chars[pos] == '"') { state = State.inAttribute; } else if (lastTokenPos == -1 && chars[pos] == '>') { lastTokenPos = pos; } } break; case openTagClosing: { if (isNextMatch("") || isNextMatch("")) { 
addToken(HTMLToken.TokenType.textContent); state = State.closingTag; } } break; case inTextContent: { if (isNextMatch("$ addToken(HTMLToken.TokenType.closeTag); break; } else if (isLastMatch(">") && isNextMatch(" addToken(HTMLToken.TokenType.closeTag); state = State.closingTag; } else if (isLastMatch(">") && chars[pos] == '<') { //