All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.github.xiaoyuge5201.file.DealHtmlContentUtil Maven / Gradle / Ivy

There is a newer version: 1.3.5
Show newest version
package com.github.xiaoyuge5201.file;

import java.util.regex.Pattern;

/**
 * Utility for extracting the plain-text content from rich-text (HTML) markup.
 *
 * <p>The stripping is purely regex based: it removes {@code <script>} and
 * {@code <style>} elements together with their bodies, then removes every
 * remaining tag. It is not an HTML parser, and character entities such as
 * {@code &nbsp;} are left in the output untouched.
 *
 * @author yugb
 */
public class DealHtmlContentUtil {

    /** Matches a complete {@code <script ...>...</script>} element, body included. */
    private static final Pattern SCRIPT_ELEMENT = Pattern.compile(
            "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>",
            Pattern.CASE_INSENSITIVE);

    /** Matches a complete {@code <style ...>...</style>} element, body included. */
    private static final Pattern STYLE_ELEMENT = Pattern.compile(
            "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>",
            Pattern.CASE_INSENSITIVE);

    /** Matches any single tag, opening or closing. */
    private static final Pattern ANY_TAG = Pattern.compile("<[^>]+>");

    /** Matches a run of one or more space characters (spaces only, not tabs). */
    private static final Pattern SPACE_RUN = Pattern.compile("[ ]+");

    /** Matches a whitespace-only line together with its terminating LF or CRLF. */
    private static final Pattern BLANK_LINE = Pattern.compile("(?m)^\\s*$(\\n|\\r\\n)");

    /**
     * Converts an HTML fragment to plain text.
     *
     * <p>Steps, in order: drop script elements, drop style elements, drop all
     * remaining tags, collapse each run of spaces into a single space, and
     * delete lines that contain only whitespace.
     *
     * @param html the HTML content; may be {@code null}
     * @return the text content, or an empty string when {@code html} is {@code null}
     */
    public static String html2Text(String html) {
        // Explicit guard: a null input yields "" without relying on a caught
        // NullPointerException and without writing anything to System.err.
        if (html == null) {
            return "";
        }

        // Script and style elements must go first so that their bodies are
        // removed too; stripping tags first would leave the JS/CSS text behind.
        String text = SCRIPT_ELEMENT.matcher(html).replaceAll("");
        text = STYLE_ELEMENT.matcher(text).replaceAll("");
        text = ANY_TAG.matcher(text).replaceAll("");

        // Tidy the whitespace left behind by the removed markup.
        text = SPACE_RUN.matcher(text).replaceAll(" ");
        return BLANK_LINE.matcher(text).replaceAll("");
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy