com.hn.poi.WordUtil Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hntool Show documentation
There is a newer version: 1.0.18
package com.hn.poi;

import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.file.FileReader;
import cn.hutool.core.util.StrUtil;
import cn.hutool.http.HtmlUtil;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import java.io.*;

/**
 * Utility for extracting the plain-text content of Word documents.
 *
 * <p>Binary Word 97-2003 files are read with {@link WordExtractor}, OOXML
 * files with {@link XWPFWordExtractor}, and HTML pages that were saved with a
 * {@code .doc} name are handled by {@link #readByHtml(File)}.
 *
 * @author fei
 */
public class WordUtil {

    /** Marker after which the visible content of an HTML-based document starts. */
    private static final String BODY_OPEN = "<body>";

    /** Marker at which the visible content of an HTML-based document ends. */
    private static final String BODY_CLOSE = "</body>";

    /**
     * Reads a Word document, supporting both the 03 and the 07 format.
     *
     * @param path path of the Word file
     * @return text content of the document
     * @throws IOException if the file cannot be opened or read
     */
    public static String read(String path) throws IOException {
        return read(FileUtil.file(path));
    }

    /**
     * Reads a Word document, supporting both the 03 and the 07 format.
     *
     * <p>The format is chosen from the lower-cased file name: names ending in
     * {@code doc} go to {@link #readBy03(File)}, names ending in {@code docx}
     * go to {@link #readBy07(File)}.
     *
     * @param file the Word file
     * @return text content of the document
     * @throws IOException   if the file cannot be opened or read
     * @throws WordException if the file name matches neither supported ending
     */
    public static String read(File file) throws IOException {
        String lowerName = file.getName().toLowerCase();
        if (lowerName.endsWith("doc")) {
            return readBy03(file);
        }
        if (lowerName.endsWith("docx")) {
            return readBy07(file);
        }
        throw new WordException("不支持此种文档解析");
    }

    /**
     * Reads a Word 03 file.
     *
     * <p>If extraction fails with an {@link ArrayIndexOutOfBoundsException},
     * the file is re-read through {@link #readByHtml(File)} instead.
     *
     * @param file the Word file
     * @return text content of the document
     * @throws IOException if the file cannot be opened, read or closed
     */
    public static String readBy03(File file) throws IOException {
        // try-with-resources: the file handle is released on success and on
        // failure, and before the HTML fallback reopens the same file.
        try (InputStream in = new FileInputStream(file)) {
            return readBy03(in);
        } catch (ArrayIndexOutOfBoundsException e) {
            // Presumably thrown by POI when the ".doc" is really an HTML page
            // rather than a binary Word file - verify against real samples.
            return readByHtml(file);
        }
    }

    /**
     * Reads a Word 03 document from a stream.
     *
     * <p>This method does not close {@code in} itself.
     *
     * @param in stream of the Word file
     * @return text content of the document
     * @throws IOException if the stream cannot be read
     */
    public static String readBy03(InputStream in) throws IOException {
        WordExtractor extractor = new WordExtractor(in);
        return extractor.getText();
    }

    /**
     * Reads a Word 07 file.
     *
     * @param file the Word file
     * @return text content of the document
     * @throws IOException if the file cannot be opened, read or closed
     */
    public static String readBy07(File file) throws IOException {
        // try-with-resources so the file handle is always released.
        try (InputStream in = new FileInputStream(file)) {
            return readBy07(in);
        }
    }

    /**
     * Reads a Word 07 document from a stream.
     *
     * <p>This method does not close {@code in} itself.
     *
     * @param in stream of the Word file
     * @return text content of the document
     * @throws IOException if the stream cannot be read
     */
    public static String readBy07(InputStream in) throws IOException {
        XWPFDocument document = new XWPFDocument(in);
        return new XWPFWordExtractor(document).getText();
    }

    /**
     * Reads a Word document that is actually an HTML page.
     *
     * <p>The file bytes are decoded as GBK, the part between {@code <body>}
     * and {@code </body>} is selected, and HTML tags are stripped with
     * {@link HtmlUtil#cleanHtmlTag(String)}. When the body markers cannot be
     * located, the whole decoded content is cleaned instead.
     *
     * @param file the Word file
     * @return the extracted text, or {@code null} if the decoded content is blank
     * @throws WordException if the GBK charset is not available
     */
    public static String readByHtml(File file) {
        byte[] bytes = FileReader.create(file).readBytes();
        String content;
        try {
            content = new String(bytes, "gbk");
        } catch (UnsupportedEncodingException e) {
            throw new WordException("word读取转换gbk字符失败");
        }
        if (StrUtil.isBlank(content)) {
            return null;
        }

        // NOTE(review): the body markers are restored as "<body>"/"</body>";
        // the previous fixed offset of 6 equals "<body>".length(). Confirm
        // against real input files (a "<body ...>" tag with attributes is not
        // matched and falls through to cleaning the whole content).
        int open = content.indexOf(BODY_OPEN);
        if (open >= 0) {
            int start = open + BODY_OPEN.length();
            int end = content.indexOf(BODY_CLOSE, start);
            if (end >= 0) {
                content = content.substring(start, end);
            }
        }
        return HtmlUtil.cleanHtmlTag(content);
    }

}