// com.hn.poi.WordUtil Maven / Gradle / Ivy
package com.hn.poi;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.file.FileReader;
import cn.hutool.core.util.StrUtil;
import cn.hutool.http.HtmlUtil;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import java.io.*;
/**
 * Utility class for reading the text content of Word documents.
 *
 * <p>Word 03 files are read through {@link WordExtractor}, Word 07 files through
 * {@link XWPFWordExtractor}, and there is a fallback for HTML content that was
 * saved under a Word file name.
 *
 * @author fei
 */
public class WordUtil {

    /** Opening fragment of the HTML body tag; the tag may carry attributes. */
    private static final String BODY_OPEN = "<body";

    /** Closing HTML body tag. */
    private static final String BODY_CLOSE = "</body>";

    /**
     * Reads a Word document; both the 03 and the 07 format are supported.
     *
     * @param path path of the Word file
     * @return text content of the document
     * @throws IOException if the file cannot be read
     */
    public static String read(String path) throws IOException {
        return read(FileUtil.file(path));
    }

    /**
     * Reads a Word document; both the 03 and the 07 format are supported.
     *
     * <p>The format is chosen from the lower-cased file name. Note that the check
     * looks at the trailing characters of the name ({@code "doc"} / {@code "docx"}),
     * not strictly at a dot-separated extension.
     *
     * @param file the Word file
     * @return text content of the document
     * @throws IOException if the file cannot be read
     */
    public static String read(File file) throws IOException {
        String lowerName = file.getName().toLowerCase();
        if (lowerName.endsWith("doc")) {
            return readBy03(file);
        }
        if (lowerName.endsWith("docx")) {
            return readBy07(file);
        }
        throw new WordException("不支持此种文档解析");
    }

    /**
     * Reads a Word 03 file.
     *
     * <p>If the extractor fails with an {@link ArrayIndexOutOfBoundsException}, the
     * file is read again as HTML via {@link #readByHtml(File)}.
     *
     * @param file the Word file
     * @return text content of the document
     * @throws IOException if the file cannot be read
     */
    public static String readBy03(File file) throws IOException {
        // try-with-resources closes the file stream on every path; it is already
        // closed by the time the catch clause below runs.
        try (FileInputStream fis = new FileInputStream(file)) {
            return readBy03(fis);
        } catch (ArrayIndexOutOfBoundsException e) {
            // Fallback kept from the original implementation: treat the file as HTML.
            return readByHtml(file);
        }
    }

    /**
     * Reads a Word 03 document from a stream.
     *
     * @param in stream of the Word file
     * @return text content of the document
     * @throws IOException if the stream cannot be read
     */
    public static String readBy03(InputStream in) throws IOException {
        // Close the extractor this method created once the text has been pulled out.
        try (WordExtractor extractor = new WordExtractor(in)) {
            return extractor.getText();
        }
    }

    /**
     * Reads a Word 07 file.
     *
     * @param file the Word file
     * @return text content of the document
     * @throws IOException if the file cannot be read
     */
    public static String readBy07(File file) throws IOException {
        try (FileInputStream fis = new FileInputStream(file)) {
            return readBy07(fis);
        }
    }

    /**
     * Reads a Word 07 document from a stream.
     *
     * @param in stream of the Word file
     * @return text content of the document
     * @throws IOException if the stream cannot be read
     */
    public static String readBy07(InputStream in) throws IOException {
        // Close the document this method created once the text has been pulled out.
        try (XWPFDocument xdoc = new XWPFDocument(in)) {
            XWPFWordExtractor extractor = new XWPFWordExtractor(xdoc);
            return extractor.getText();
        }
    }

    /**
     * Reads a Word document that is actually HTML content.
     *
     * <p>The file bytes are decoded as GBK, the part between the body tags is cut
     * out and all HTML tags are removed from it.
     *
     * @param file the Word file
     * @return the text that was read, or {@code null} if the file content is blank
     */
    public static String readByHtml(File file) {
        byte[] bytes = FileReader.create(file).readBytes();
        String content;
        try {
            content = new String(bytes, "gbk");
        } catch (UnsupportedEncodingException e) {
            throw new WordException("word读取转换gbk字符失败");
        }
        if (StrUtil.isBlank(content)) {
            return null;
        }
        return HtmlUtil.cleanHtmlTag(extractBody(content));
    }

    /**
     * Returns the part of {@code html} between the opening and closing body tags.
     *
     * <p>The opening tag may carry attributes. If the opening tag is missing the
     * result starts at the beginning of the text; if the closing tag is missing it
     * runs to the end of the text.
     *
     * @param html raw HTML text
     * @return the body section, or the whole text when no body tags are present
     */
    private static String extractBody(String html) {
        int from = 0;
        int open = html.indexOf(BODY_OPEN);
        if (open >= 0) {
            // Skip to the '>' that ends the opening tag so attributes are not kept.
            int tagEnd = html.indexOf('>', open);
            if (tagEnd >= 0) {
                from = tagEnd + 1;
            }
        }
        int to = html.indexOf(BODY_CLOSE, from);
        if (to < 0) {
            to = html.length();
        }
        return html.substring(from, to);
    }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy