All downloads are free. The search and download features use the official Maven repository.

org.ddr.poi.html.util.JsoupUtils Maven / Gradle / Ivy

/*
 * Copyright 2016 - 2021 Draco, https://github.com/draco1023
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.ddr.poi.html.util;

import org.ddr.poi.html.HtmlConstants;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.parser.CustomHtmlTreeBuilder;
import org.jsoup.parser.Parser;
import org.jsoup.select.Elements;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.function.Predicate;

/**
 * Utility methods for navigating and parsing jsoup documents.
 *
 * <p>All tag-name comparisons are made against {@link Element#normalName()},
 * so tag names passed to these methods are expected to be lowercase.
 *
 * @author Draco
 * @since 2021-03-03
 */
public class JsoupUtils {
    /**
     * Adds every direct child element of {@code parent} that satisfies
     * {@code predicate} to {@code collection}, in child order.
     * Non-element child nodes are skipped.
     *
     * @param collection target collection that receives the matching children
     * @param parent parent element whose direct children are examined
     * @param predicate condition a child element must satisfy to be selected
     */
    public static void selectChildren(Elements collection, Element parent, Predicate<? super Element> predicate) {
        for (Node node : parent.childNodes()) {
            if (node instanceof Element) {
                Element child = (Element) node;
                if (predicate.test(child)) {
                    collection.add(child);
                }
            }
        }
    }

    /**
     * Selects the direct child elements that have the given tag.
     *
     * @param parent parent element
     * @param tag tag name, lowercase
     * @return the matching child elements, possibly empty
     */
    public static Elements children(Element parent, String tag) {
        Elements elements = new Elements();
        selectChildren(elements, parent, c -> c.normalName().equals(tag));
        return elements;
    }

    /**
     * Selects the direct child elements whose tag is one of the given tags.
     *
     * @param parent parent element
     * @param tags tag names, lowercase
     * @return the matching child elements, possibly empty
     */
    public static Elements children(Element parent, String... tags) {
        Elements elements = new Elements();
        // Copy into a hash set so each child needs a single lookup regardless of how many tags are given.
        Set<String> targets = new HashSet<>(Arrays.asList(tags));
        selectChildren(elements, parent, c -> targets.contains(c.normalName()));
        return elements;
    }

    /**
     * Finds the first direct child element that has the given tag.
     *
     * @param parent parent element
     * @param tag tag name, lowercase
     * @return the first matching child element, or {@code null} if there is none
     */
    public static Element firstChild(Element parent, String tag) {
        for (Node node : parent.childNodes()) {
            if (node instanceof Element) {
                Element child = (Element) node;
                if (child.normalName().equals(tag)) {
                    return child;
                }
            }
        }
        return null;
    }

    /**
     * Selects all row elements that belong to a table.
     *
     * <p>Rows are collected from two levels only: rows that are direct children
     * of {@code parent}, and rows that are direct children of any other child
     * element of {@code parent}. Deeper descendants are never visited.
     *
     * @param parent table element
     * @return the row elements in document order, possibly empty
     */
    public static Elements childRows(Element parent) {
        Elements elements = new Elements();
        for (Node node : parent.childNodes()) {
            if (node instanceof Element) {
                Element child = (Element) node;
                if (isTableRow(child)) {
                    // The row sits directly under the table tag.
                    elements.add(child);
                } else {
                    // The rows may sit under thead/tbody/tfoot; only direct children are
                    // selected so that rows of nested tables are not picked up.
                    selectChildren(elements, child, JsoupUtils::isTableRow);
                }
            }
        }
        return elements;
    }

    /**
     * Tells whether the element's normalized tag name equals {@link HtmlConstants#TAG_TR}.
     */
    private static boolean isTableRow(Element element) {
        return HtmlConstants.TAG_TR.equals(element.normalName());
    }

    /**
     * Parses an HTML string into a {@link Document} using a parser backed by a
     * new {@link CustomHtmlTreeBuilder}.
     *
     * @param html HTML source to parse
     * @return the parsed document
     * @see org.jsoup.Jsoup#parseBodyFragment(String)
     * @see org.jsoup.parser.Parser#parseBodyFragment(String, String)
     */
    public static Document parse(String html) {
        // A fresh tree builder is created for every call; none is shared between parses.
        CustomHtmlTreeBuilder treeBuilder = new CustomHtmlTreeBuilder();
        return Jsoup.parse(html, new Parser(treeBuilder));
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy