All Downloads are FREE. Search and download functionalities are using the official Maven repository.

starSpider.container.HTMLDocument Maven / Gradle / Ivy

package starSpider.container;

import starSpider.ConstantRegion;
import starSpider.parser.StarSpider;

import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * HTML解析器解析出来的HTML文档对象
 *
 * @author zhao
 */
public class HTMLDocument extends LABELDocument {
    // (?<=)
    public final static Pattern CHILDREN_REX = Pattern.compile("(?<=<)[^/].*?.*?(?=>+?)");
    public final static Pattern REGEXP_INVISIBLE_CHARACTERS_ALL_PATTERN = Pattern.compile(ConstantRegion.REGEXP_INVISIBLE_CHARACTERS_ALL);
    public final String childrenNodeName;
    public final String HTMLData;

    /**
     * 构造一个文档对象
     *
     * @param text     当前LABEL节点的数据
     * @param attribs  当前LABEL节点的属性
     * @param name     当前LABEL的标签名称
     * @param HTMLData 该对象中,原本HTML或XML文档的对象。
     */
    public HTMLDocument(String[] text, HashMap attribs, String name, String HTMLData) {
        super(text, text.length == 0 ? ConstantRegion.STRING_NULL : text[0], attribs, name);
        this.HTMLData = HTMLData;
        Matcher matcher = HTMLDocument.CHILDREN_REX.matcher(this.HTMLData);
        if (matcher.find(2)) {
            this.childrenNodeName = REGEXP_INVISIBLE_CHARACTERS_ALL_PATTERN.split(matcher.group())[0];
        } else this.childrenNodeName = null;
    }

    /**
     * @return 当前节点下的第一个子节点
     * 

* The first child node under the current node */ public HTMLDocument getChildren() { // 获取到子标签 if (this.childrenNodeName == null) return null; return (HTMLDocument) StarSpider.HTML_PARSER.ANodeParse(this.HTMLData, this.childrenNodeName, 2); } /** * 获取当下节点所有子节点中的起名为某个节点名称的所有节点(含当前节点) * * @param childrenNodeName 需要获取到的子节点的节点名字 * @return 当前节点与当前所有子节点中的某一个指定节点数据 */ public HTMLDocument[] getAllChildrenByNodeName(String childrenNodeName) { return (HTMLDocument[]) StarSpider.HTML_PARSER.getDocumentByNodeName(this.HTMLData, false, ConstantRegion.PARSER_NAME_HTML, childrenNodeName); } /** * 获取当下节点所有子节点中 属性为 childrenNodeAttribKey = childrenNodeAttribValue 的所有子节点(含当前节点) * * @param childrenNodeAttribKey 需要获取的子节点的属性名称 * @param childrenNodeAttribValue 需要获取的子节点的属性值 * @return 当前节点与当前所有子节点中的所有符合属性条件的节点文档对象 */ public HTMLDocument[] getAllChildrenByNodeAttrib(String childrenNodeAttribKey, String childrenNodeAttribValue) { return (HTMLDocument[]) StarSpider.HTML_PARSER.getDocumentByNodeAttrib(this.HTMLData, ConstantRegion.PARSER_NAME_HTML, childrenNodeAttribKey, childrenNodeAttribValue); } @Override public String toString() { return this.HTMLData; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy